The Ultimate Guide to Practical Use of the Web Speech API



In this article, I want to share the results of my study of the main features of the Web Speech API



(hereafter WSA



).







Introduction



WSA



is an experimental technology consisting of two interfaces: SpeechSynthesis



(interface for text-to-speech) and SpeechRecognition



(interface for speech recognition).







, MDN ( , , - ).







, Can I use:











, SpeechSynthesis



. , SpeechRecognition



. " ", SpeechRecognition



-, . , .







4 , 2 :







  1. , SpeechSynthesis



    , ()
  2. ""
  3. , , , ( ),


, .









:







<!-- Speech synthesis player: text input, voice picker, three sliders and the control buttons -->
<div id="wrapper">
  <h1>Speech Synthesis - Player</h1>
  <label
    >Text:
    <textarea id="textarea">!  ?</textarea>
  </label>
  <label
    >Voice:
    <!-- Options are created from script, one per available voice -->
    <select id="select"></select>
  </label>
  <!-- Each slider is directly followed by a <span> that shows its current value -->
  <label
    >Volume:
    <input id="volume" type="range" min="0" max="1" step="0.1" value="1" />
    <span>1</span>
  </label>
  <label
    >Rate:
    <input id="rate" type="range" min="0" max="3" step="0.5" value="1" />
    <span>1</span>
  </label>
  <label
    >Pitch:
    <input id="pitch" type="range" min="0" max="2" step="0.5" value="1" />
    <span>1</span>
  </label>
  <!-- The click handler tells the buttons apart by their class name -->
  <div id="buttons">
    <button class="speak">Speak</button>
    <button class="cancel">Cancel</button>
    <button class="pause">Pause</button>
    <button class="resume">Resume</button>
  </div>
</div>
      
      





(textarea



), ; (select



) , ; - (volume



), (rate



) (pitch



), (speak



), (cancel



), (pause



) (resume



) . , .







min



, max



, step



value



. , , .. . . id



, .







JavaScript



. SpeechSynthesisUtterance



("utterance" " "):







// Single utterance instance; `convertTextToSpeech()` reconfigures and reuses it for every request
const U = new SpeechSynthesisUtterance()
      
      





( "", , , Chrome ):







// Initial snapshot of the voice list; it is reassigned in the `voiceschanged` handler
// NOTE(review): in Chrome this first call presumably returns an empty list - confirm
let voices = speechSynthesis.getVoices()
      
      





getVoices()



voiceschanged



. "" :







// Refresh the voice list and rebuild the voice picker whenever the browser
// reports that the set of available voices has changed
speechSynthesis.onvoiceschanged = () => {
  voices = speechSynthesis.getVoices()
  populateVoices(voices)
}

// Some browsers expose the voice list synchronously and may never fire
// `voiceschanged`; populate immediately when the initial call already returned voices
if (voices.length > 0) {
  populateVoices(voices)
}
      
      





( ) :







0: SpeechSynthesisVoice
  default: true
  lang: "de-DE"
  localService: false
  name: "Google Deutsch"
  voiceURI: "Google Deutsch"
      
      





:







/**
 * Fills the voice `<select>` with one option per voice, preselects a voice
 * and initializes the UI handlers.
 *
 * @param {SpeechSynthesisVoice[]} voices - list returned by `speechSynthesis.getVoices()`
 */
function populateVoices(voices) {
  // Drop options left over from a previous (possibly longer) voice list
  select.options.length = 0

  // Option text is the voice name, option value is the voice's index in `voices`
  voices.forEach((voice, index) => {
    select.options[index] = new Option(voice.name, index)
  })

  // Preferred voice, matched by name (literal kept exactly as in the original)
  const preferredIndex = voices.findIndex(
    (voice) => voice.name === 'Google '
  )
  // `findIndex` returns -1 when the preferred voice is not installed; assigning -1
  // would leave nothing selected, so fall back to the browser default voice,
  // then to the first entry
  const browserDefaultIndex = voices.findIndex((voice) => voice.default)
  select.selectedIndex =
    preferredIndex !== -1 ? preferredIndex : Math.max(browserDefaultIndex, 0)

  // Wire up the utterance and UI handlers
  initializeHandlers()
}
      
      





:







/**
 * Attaches the utterance lifecycle loggers and the UI handlers
 * (slider read-outs and the speak / cancel / pause / resume buttons).
 *
 * Safe to call repeatedly: every handler is assigned through an `on*` property,
 * so a repeated call replaces the previous handler instead of stacking another one.
 */
function initializeHandlers() {
  // Utterance lifecycle: start, end and error
  U.onstart = () => console.log('Started')
  U.onend = () => console.log('Finished')
  U.onerror = (err) => console.error(err)
  // Pause and resume notifications
  U.onpause = () => console.log('Paused')
  U.onresume = () => console.log('Resumed')

  // Slider changes bubble up to the wrapper; other controls are ignored
  wrapper.onchange = ({ target }) => {
    if (target.type !== 'range') return
    handleChange(target)
  }

  // Button clicks are delegated to the container and dispatched by class name
  buttons.onclick = ({ target: { className } }) => {
    switch (className) {
      case 'speak':
        // Do not start a new utterance while speech is already in progress
        if (!speechSynthesis.speaking) {
          convertTextToSpeech()
        }
        break
      case 'cancel':
        speechSynthesis.cancel()
        break
      case 'pause':
        speechSynthesis.pause()
        break
      case 'resume':
        speechSynthesis.resume()
        break
      default:
        break
    }
  }
}
      
      





:







/**
 * Mirrors a control's current value into the element that directly follows it.
 *
 * @param {HTMLInputElement} el - the input whose value changed
 */
function handleChange(el) {
  const { value, nextElementSibling: readout } = el
  readout.textContent = value
}
      
      





:







/**
 * Reads the text and the current control values from the page, configures the
 * shared utterance `U` with them and queues it for speaking.
 * Does nothing when the text field is empty or contains only whitespace.
 */
function convertTextToSpeech() {
  // Text to speak
  const trimmed = textarea.value.trim()
  if (!trimmed) return
  U.text = trimmed

  // The option value is the voice's index in `voices`; it resolves to nothing
  // when no option is selected or the list is still empty
  const voice = voices[select.value]
  if (voice) {
    U.voice = voice
    // Speak in the language of the chosen voice
    U.lang = voice.lang
  } else {
    // No usable selection: clear any previously set voice instead of throwing
    U.voice = null
  }

  // Input values are strings; convert them explicitly to numbers
  U.volume = Number(volume.value)
  U.rate = Number(rate.value)
  U.pitch = Number(pitch.value)

  speechSynthesis.speak(U)
}
      
      





SpeechSynthesisUtterance



:







SpeechSynthesisUtterance
  lang: "ru-RU"
  onboundary: null
  onend: () => console.log('Finished')
  onerror: (err) => console.error(err)
  onmark: null
  onpause: () => console.log('Paused')
  onresume: () => console.log('Resumed')
  onstart: () => console.log('Started')
  pitch: 1
  rate: 1
  text: "!  ?"
  voice: SpeechSynthesisVoice { voiceURI: "Google ", name: "Google ", lang: "ru-RU", localService: false, default: false }
  volume: 1
      
      





:







// Keyboard shortcuts: S - speak, C - cancel, P - pause, R - resume
window.onkeydown = ({ key, target, ctrlKey, metaKey, altKey }) => {
  // Browser shortcuts such as Ctrl+C or Ctrl+R must not double as speech commands
  if (ctrlKey || metaKey || altKey) return
  // Letters typed into the text field or another form control are not shortcuts
  if (target && /^(TEXTAREA|INPUT|SELECT)$/.test(target.tagName)) return
  if (typeof key !== 'string') return

  switch (key.toLowerCase()) {
    case 's':
      // Do not start a new utterance while speech is already in progress
      if (!speechSynthesis.speaking) {
        convertTextToSpeech()
      }
      break
    case 'c':
      speechSynthesis.cancel()
      break
    case 'p':
      speechSynthesis.pause()
      break
    case 'r':
      speechSynthesis.resume()
      break
    default:
      break
  }
}
      
      





:











SpeechSynthesis



, , - , . .







SpeechSynthesis



. , , , , , - . , , .







:







<!-- Page reader: every paragraph is preceded by its own play button -->
<div id="wrapper">
  <h1>Speech Synthesis - Page Reader</h1>
  <div>
    <button class="play" tabindex="1"></button>
    <p>
      JavaScript —   . 
      -,    .
         ECMAScript ( ECMA-262).
    </p>
  </div>
  <div>
    <button class="play" tabindex="2"></button>
    <p>
      JavaScript       
         .     
            
      -.
    </p>
  </div>
  <div>
    <button class="play" tabindex="3"></button>
    <p>
        :  , 
      ,   , 
      ,     .
    </p>
  </div>
</div>
      
      





(div



) (play



) , , , ( JavaScript



). , tabindex



, tab



space



. , , tabindex



, tab



.







, :







// Voice list and the voice used for reading; both are (re)assigned in `loadVoices`
let voices = speechSynthesis.getVoices()
let defaultVoice

// Picks the reading voice and attaches the page-level listeners.
// Registering the same function reference again is a no-op for `addEventListener`,
// so running this more than once does not duplicate the listeners.
const loadVoices = () => {
  voices = speechSynthesis.getVoices()
  // Prefer the `Google ` voice (literal kept exactly as in the original); when it is
  // not installed fall back to the browser default, then to the first voice,
  // so `defaultVoice` is undefined only when there are no voices at all
  defaultVoice =
    voices.find((voice) => voice.name === 'Google ') ||
    voices.find((voice) => voice.default) ||
    voices[0]

  wrapper.addEventListener('click', handleClick)
  window.addEventListener('keydown', handleKeydown)
}

speechSynthesis.onvoiceschanged = loadVoices

// Some browsers expose the voice list synchronously and may never fire
// `voiceschanged`; initialize immediately when voices are already available
if (voices.length > 0) {
  loadVoices()
}

// Button states; the button's class name is switched between these values
const PLAY = 'play'
const PAUSE = 'pause'
const RESUME = 'resume'

/**
 * Delegated click handler for the reader buttons. The clicked element's class
 * name is its state: `play` starts reading the paragraph that follows the
 * button, `pause` pauses, `resume` continues.
 *
 * @param {MouseEvent} event - click event; only `target` is used
 */
function handleClick({ target }) {
  switch (target.className) {
    case PLAY: {
      // Stop whatever is being read (and empty the queue) before starting over
      speechSynthesis.cancel()

      const { textContent } = target.nextElementSibling

      // Queue one utterance per sentence, splitting on '.'
      textContent.split('.').forEach((text) => {
        const trimmed = text.trim()
        if (trimmed) {
          // Speak the trimmed sentence, not the raw chunk with its surrounding whitespace
          speechSynthesis.speak(getUtterance(target, trimmed))
        }
      })
      break
    }
    case PAUSE:
      // Switching the class turns the same button into a "resume" button
      // (presumably styled through CSS, which is not visible here)
      target.className = RESUME
      speechSynthesis.pause()
      break
    case RESUME:
      target.className = PAUSE
      speechSynthesis.resume()
      break
    default:
      break
  }
}

/**
 * Global keyboard handler: pressing `Escape` cancels speech; every other key is ignored.
 *
 * @param {KeyboardEvent} event - keydown event; only `code` is used
 */
function handleKeydown({ code }) {
  if (code === 'Escape') {
    return speechSynthesis.cancel()
  }
}

/**
 * Builds an utterance for one chunk of text and ties its lifecycle to the
 * button that started it.
 *
 * @param {Element} target - the button whose class name reflects the playback state
 * @param {string} text - the text to speak
 * @returns {SpeechSynthesisUtterance} the configured utterance
 */
function getUtterance(target, text) {
  const U = new SpeechSynthesisUtterance(text)
  // `defaultVoice` is undefined until voices are loaded or when no matching voice
  // exists; in that case leave voice and language unset instead of throwing
  if (defaultVoice) {
    U.voice = defaultVoice
    U.lang = defaultVoice.lang
  }
  U.volume = 1
  U.rate = 1
  U.pitch = 1

  // While this utterance is being spoken the button acts as "pause";
  // once it ends the button goes back to "play"
  U.onstart = () => {
    console.log('Started')
    target.className = PAUSE
  }
  U.onend = () => {
    console.log('Finished')
    target.className = PLAY
  }
  U.onerror = (err) => console.error(err)

  return U
}
      
      





β€” ( (.



)), , (, ) — textContent.split('.').forEach(...)



. , 220 ( Chrome). text-to-long



( ), , , , SpeechSynthesis



( ). , .







:









""



, SpeechSynthesis



, , , , , , WSA



— SpeechRecognition



.







:







<!-- Dictaphone: final text area, interim text field and the control buttons -->
<div id="wrapper">
  <h1>Speech Recognition - Dictaphone</h1>
  <!-- Receives the accumulated final transcript -->
  <textarea id="final_text" cols="30" rows="10"></textarea>
  <!-- Receives the interim (not yet final) transcript -->
  <input type="text" id="interim_text" />
  <!-- The click handler tells the buttons apart by their class name -->
  <div id="buttons">
    <button class="start"></button>
    <button class="stop"></button>
    <button class="abort"></button>
    <button class="copy"></button>
    <button class="clear"></button>
  </div>
</div>
      
      





() (final_text



) ( ) (interim_text



), (buttons



). textarea



input



, , , . , , , Chrome .







(start



, stop



abort



), Clipboard API



.







:







// Accumulated final transcript; the `result` handler appends to it and mirrors it into the textarea
let final_transcript = ''
// True while recognition should keep running; the `end` handler restarts recognition when it is set
let recognizing = false
      
      





, SpeechRecognition



:







// Use the standard constructor when it exists, otherwise the `webkit`-prefixed one
const speechRecognition =
  window.SpeechRecognition || window.webkitSpeechRecognition
// The single recognition instance used by the rest of the script
const recognition = new speechRecognition()
// Apply the settings in one place (same properties, same order as individual assignments)
Object.assign(recognition, {
  // Continuous mode
  continuous: true,
  // Deliver interim results as well as final ones
  interimResults: true,
  // Ask for up to three alternatives per result
  maxAlternatives: 3,
  // Recognition language
  lang: 'ru-RU'
})
      
      





, :







// Recognition session started
recognition.onstart = () => {
  console.log('  ')
}
// Log the error code; give up on errors that would repeat on every attempt
recognition.onerror = ({ error }) => {
  console.error(error)
  // An `end` event follows the error. For these codes a restart fails again
  // immediately, so the auto-restart below would otherwise loop forever
  const isPersistent =
    error === 'not-allowed' ||
    error === 'service-not-allowed' ||
    error === 'audio-capture' ||
    error === 'language-not-supported'
  if (isPersistent) {
    recognizing = false
  }
}
// The session can end on its own; restart it while the user still wants to dictate
recognition.onend = () => {
  console.log('  ')
  if (!recognizing) return
  recognition.start()
}
      
      





SpeechRecognition



, , "" . "" . , :







// Spoken word -> character it is replaced with.
// NOTE(review): the original keys were lost to an encoding problem, which left this
// literal syntactically invalid. The keys below are reconstructed from the values and
// from the recognition language (`ru-RU`) - confirm the exact spellings.
// Keys must be single lower-case words because the lookup is done word by word.
const DICTIONARY = {
  точка: '.',
  запятая: ',',
  вопрос: '?',
  восклицание: '!',
  двоеточие: ':',
  тире: '-',
  абзац: '\n',
  отступ: '\t'
}
      
      





, , , SpeechGrammar



SpeechGrammarList



, JSpeech Grammar Format



, , .







/, . , . , , , , " ", " " .. , : '?'



, .







:







/**
 * Replaces every word that is a key of `DICTIONARY` (case-insensitive) with its
 * dictionary value; all other words are kept, trimmed.
 *
 * @param {string} s - recognized text
 * @returns {string} text with dictionary words replaced
 */
function editInterim(s) {
  return s
    .split(' ')
    .map((word) => {
      const trimmed = word.trim()
      const key = trimmed.toLowerCase()
      // Own-property check: a plain lookup would also match inherited keys such
      // as `constructor` and put a function into the text
      const replacement = Object.prototype.hasOwnProperty.call(DICTIONARY, key)
        ? DICTIONARY[key]
        : undefined
      // As before, an empty replacement keeps the original word
      return replacement || trimmed
    })
    .join(' ')
}

/**
 * Removes the whitespace in front of a punctuation mark (`.`, `+`, `,`, `?`, `!`, `:`, `-`).
 *
 * @param {string} s - text to tidy up
 * @returns {string} the text with each mark attached to the preceding word
 */
function editFinal(s) {
  const spaceBeforeMark = /\s{1,}([\.+,?!:-])/g
  // Keep only the captured mark, dropping the whitespace matched in front of it
  return s.replace(spaceBeforeMark, (_match, mark) => mark)
}
      
      





editInterim()



, , . , . word = word.trim()



, "" , .







editFinal()



β€” editInterim()



. , .







, , β€” result



. :







// Handles every batch of recognition results
recognition.onresult = (e) => {
  // Interim text is rebuilt from scratch on each event
  let interim_transcript = ''
  const { results, resultIndex } = e

  // Only results from `resultIndex` onwards are processed
  for (let i = resultIndex; i < results.length; i++) {
    const result = results[i]
    // The first alternative of each result is the one used
    const heard = result[0].transcript

    if (result.isFinal) {
      // Final chunk: apply the dictionary replacements and append it to the final text
      final_transcript += editInterim(heard)
    } else {
      // Still being recognized: show it as interim text only
      interim_transcript += heard
    }
  }

  // Interim text goes to the `input`
  interim_text.value = interim_transcript
  // Tidy up the spacing around punctuation in the accumulated final text
  final_transcript = editFinal(final_transcript)
  // Final text goes to the `textarea`
  final_text.value = final_transcript
}
      
      





:







SpeechRecognitionEvent
  bubbles: false
  cancelBubble: false
  cancelable: false
  composed: false
  currentTarget: SpeechRecognition {grammars: SpeechGrammarList, lang: "ru-RU", continuous: true, interimResults: true, maxAlternatives: 3, …}
  defaultPrevented: false
  emma: null
  eventPhase: 0
  interpretation: null
  isTrusted: true
  path: []
  resultIndex: 1
  //     
  results: SpeechRecognitionResultList {0: SpeechRecognitionResult, 1: SpeechRecognitionResult, length: 2}
  returnValue: true
  srcElement: SpeechRecognition {grammars: SpeechGrammarList, lang: "ru-RU", continuous: true, interimResults: true, maxAlternatives: 3, …}
  target: SpeechRecognition {grammars: SpeechGrammarList, lang: "ru-RU", continuous: true, interimResults: true, maxAlternatives: 3, …}
  timeStamp: 59862.61999979615
  type: "result"
      
      





(SpeechRecognitionResultList) :







results: SpeechRecognitionResultList
  0: SpeechRecognitionResult
    0: SpeechRecognitionAlternative
      confidence: 0.7990190982818604
      transcript: ""
    isFinal: true
    length: 1
  length: 1
      
      





e.results[i][0].transcript



. , maxAlternatives = 3



, SpeechRecognitionAlternative



. ( 0



) .







, , :







// Delegated click handler for the dictaphone buttons, dispatched by class name
buttons.onclick = ({ target }) => {
  switch (target.className) {
    case 'start':
      // Recognition is already requested: calling `start()` again would throw
      if (recognizing) break
      // Start from a clean slate
      final_transcript = ''
      final_text.value = ''
      interim_text.value = ''
      // Set the flag first so the `end` handler restarts recognition if `start()`
      // fails because a previous session is still shutting down
      recognizing = true
      try {
        recognition.start()
      } catch (err) {
        console.error(err)
      }
      break
    case 'stop':
      // Clear the flag so the `end` handler does not restart recognition
      recognition.stop()
      recognizing = false
      break
    case 'abort':
      recognition.abort()
      recognizing = false
      break
    case 'copy':
      // Copy the textarea content; change the button label only after the copy
      // succeeded and restore it three seconds later
      navigator.clipboard
        .writeText(final_text.value)
        .then(() => {
          target.textContent = ''
          setTimeout(() => {
            target.textContent = ''
          }, 3000)
        })
        .catch((err) => console.error(err))
      break
    case 'clear':
      // Clear both the accumulated text and the textarea that mirrors it
      final_transcript = ''
      final_text.value = ''
      break
    default:
      break
  }
}
      
      





. "", , ( speechstart



), ( ), , " ". input



- " ", textarea



". ?". , .







:









, , . : () => removeLastWord()



( , typeof DICTIONARY[word] === 'function'



) :







/**
 * Removes the last word (and the whitespace around it) from the final text.
 * The accumulated `final_transcript` is updated as well, because the `result`
 * handler rewrites the textarea from it and would otherwise bring the word back.
 */
function removeLastWord() {
  const oldStr = final_text.value
  // Trailing whitespace is ignored, so "one two " becomes "one" rather than "one two"
  const newStr = oldStr.replace(/\s*\S+\s*$/, '')
  final_text.value = newStr
  final_transcript = newStr
}
      
      





, β€” , , .









SpeechRecognition



, , .







, .







pages



:







// home.js - markup of the home page: a wrapper with nine placeholder sections
export default /*html*/ `
<div id="wrapper">
  <div>Section 1</div>
  <div>Section 2</div>
  <div>Section 3</div>
  <div>Section 4</div>
  <div>Section 5</div>
  <div>Section 6</div>
  <div>Section 7</div>
  <div>Section 8</div>
  <div>Section 9</div>
</div>
`

// product.js - markup of the product page
export default /*html*/ `
<h1>This is the Product Page</h1>
`

// about.js - markup of the about page
export default /*html*/ `
<h1>This is the About Page</h1>
`
      
      





:







import HomePage from './pages/home.js'
import ProductPage from './pages/product.js'
import AboutPage from './pages/about.js'

// Render the home page markup into the document body on load
const { body } = document
body.innerHTML = HomePage
      
      





:







// Scroll directions
const DOWN = 'down'
const UP = 'up'
const RIGHT = 'right'
const LEFT = 'left'

// Voice commands: the recognized word is the key, the method is the action to run
const ACTIONS = {
  // Page switching: replace the body markup with the requested page
  home() {
    return (body.innerHTML = HomePage)
  },
  product() {
    return (body.innerHTML = ProductPage)
  },
  about() {
    return (body.innerHTML = AboutPage)
  },

  // Scrolling in the named direction
  down() {
    return scroll(DOWN)
  },
  up() {
    return scroll(UP)
  },
  left() {
    return scroll(LEFT)
  },
  right() {
    return scroll(RIGHT)
  },

  // Theme switching through the body's class
  light() {
    return body.removeAttribute('class')
  },
  dark() {
    return (body.className = 'dark')
  }
}
      
      





SpeechRecognition



, , ( — : recognition.lang = 'en-US'



).







result



:







// Runs the voice command for every recognized word that names one
recognition.onresult = (e) => {
  // Only results from `resultIndex` onwards are processed
  for (let i = e.resultIndex; i < e.results.length; i++) {
    // Commands are executed for final results only
    if (!e.results[i].isFinal) continue

    const result = e.results[i][0].transcript.toLowerCase()
    // Log what was heard, which helps to see how a command was actually recognized
    console.log(result)

    // A phrase may contain several words, so check each of them
    result.split(' ').forEach((word) => {
      const command = word.trim()
      // Own-property check: a plain lookup would also match inherited keys such
      // as `constructor` or `__proto__` (calling the latter throws)
      if (Object.prototype.hasOwnProperty.call(ACTIONS, command)) {
        ACTIONS[command]()
      }
    })
  }
}
      
      





:







/**
 * Smoothly scrolls the window by one viewport in the given direction.
 * Any value other than the four direction constants results in a horizontal
 * scroll request with an undefined position.
 *
 * @param {string} direction - one of `DOWN`, `UP`, `RIGHT`, `LEFT`
 */
function scroll(direction) {
  // Target offset: current position plus or minus one viewport size
  let offset
  if (direction === DOWN) {
    offset = scrollY + innerHeight
  } else if (direction === UP) {
    offset = scrollY - innerHeight
  } else if (direction === RIGHT) {
    offset = scrollX + innerWidth
  } else if (direction === LEFT) {
    offset = scrollX - innerWidth
  }

  // Vertical directions set `top`, everything else sets `left`
  const axis = direction === DOWN || direction === UP ? 'top' : 'left'
  scrollTo({ [axis]: offset, behavior: 'smooth' })
}
      
      





, :







// Keyboard control of recognition: Space starts it, Escape stops it
window.addEventListener('keydown', (e) => {
  switch (e.code) {
    case 'Space':
      // Suppress the default action for handled keys only (Space would otherwise
      // scroll the page); all other keys keep their normal behavior
      e.preventDefault()
      // Ignore auto-repeat and presses while recognition is already requested:
      // calling `start()` again would throw
      if (e.repeat || recognizing) break
      recognition.start()
      recognizing = true
      break
    case 'Escape':
      e.preventDefault()
      recognition.stop()
      recognizing = false
      break
    default:
      break
  }
})
      
      





. , , :







  • home



    , product



    , about



    β€”
  • dark



    , light



    β€”
  • down



    , up



    , left



    , right



    β€” ( , scroll



    , , scroll down



    )


:











, WSA



JavaScript



, , "" -. , . , WSA



β€” , , , , .







, WSA



. , , SpeechSynthesis



SpeechRecognition



. , β€” , , , , , . : β€” , , - ""



,



, , ,



. , . .










.







10% !














All Articles