🛀🏻 🏇🏿 🌅 The Ultimate Guide to Practical Use of the Web Speech API 🤞🏿 🥢 🅾️

In this article, I want to share the results of my study of the main features of the Web Speech API (hereafter WSA).

Introduction

WSA is an experimental technology consisting of two interfaces: SpeechSynthesis (an interface for text-to-speech) and SpeechRecognition (an interface for speech recognition).

, MDN ( , , - ).

, Can I use:

, SpeechSynthesis

. , SpeechRecognition

. " ", SpeechRecognition

-, . , .

4 , 2 :

, SpeechSynthesis

, ()
""
, , , ( ),

, .

<div id="wrapper">
  <h1>Speech Synthesis - Player</h1>
  <label
    >Text:
    <textarea id="textarea">!  ?</textarea>
  </label>
  <label
    >Voice:
    <select id="select"></select>
  </label>
  <label
    >Volume:
    <input id="volume" type="range" min="0" max="1" step="0.1" value="1" />
    <span>1</span>
  </label>
  <label
    >Rate:
    <input id="rate" type="range" min="0" max="3" step="0.5" value="1" />
    <span>1</span>
  </label>
  <label
    >Pitch:
    <input id="pitch" type="range" min="0" max="2" step="0.5" value="1" />
    <span>1</span>
  </label>
  <div id="buttons">
    <button class="speak">Speak</button>
    <button class="cancel">Cancel</button>
    <button class="pause">Pause</button>
    <button class="resume">Resume</button>
  </div>
</div>

(textarea

), ; (select

) , ; - (volume

), (rate

) (pitch

), (speak

), (cancel

), (pause

) (resume

) . , .

min

, max

, step

value

. , , .. . . id

, .

JavaScript

. SpeechSynthesisUtterance

("utterance" " "):

// Utterance object that the player example configures and passes to `speechSynthesis.speak()`
const U = new SpeechSynthesisUtterance()

( "", , , Chrome ):

// Initial read of the voice list; reassigned by the `voiceschanged` handler
let voices = speechSynthesis.getVoices()

getVoices()

voiceschanged

. "" :

// Re-read the voice list whenever the browser reports that it changed
// and rebuild the voice drop-down from the fresh list
speechSynthesis.onvoiceschanged = () => {
  voices = speechSynthesis.getVoices()
  populateVoices(voices)
}

( ) :

0: SpeechSynthesisVoice
  default: true
  lang: "de-DE"
  localService: false
  name: "Google Deutsch"
  voiceURI: "Google Deutsch"

/**
 * Rebuilds the voice `<select>` from the given list, pre-selects a voice
 * and then wires up the UI handlers.
 *
 * @param {SpeechSynthesisVoice[]} voices - voices returned by `speechSynthesis.getVoices()`
 */
function populateVoices(voices) {
  // Drop options left over from a previous (possibly longer) voice list
  select.options.length = 0

  // Option label is the voice name, option value is its index in `voices`
  voices.forEach((voice, index) => {
    select.options[index] = new Option(voice.name, index)
  })

  // Prefer the voice named `Google `; when it is not installed fall back to
  // the voice flagged as default and finally to the first entry, so that a
  // valid option is always selected instead of index -1
  const preferredIndex = voices.findIndex((voice) => voice.name === 'Google ')
  const fallbackIndex = voices.findIndex((voice) => voice.default)
  select.selectedIndex =
    preferredIndex !== -1 ? preferredIndex : Math.max(fallbackIndex, 0)

  // The voice list is ready, the controls can be activated
  initializeHandlers()
}

/**
 * Attaches every handler of the player: utterance lifecycle logging,
 * slider change tracking and the control buttons.
 *
 * Safe to call repeatedly: the click listener is a stable function reference,
 * so `addEventListener` ignores repeated registrations, and the remaining
 * handlers are plain property assignments.
 */
function initializeHandlers() {
  // Log the lifecycle of the utterance
  U.onstart = () => console.log('Started')
  U.onend = () => console.log('Finished')
  U.onerror = (err) => console.error(err)
  U.onpause = () => console.log('Paused')
  U.onresume = () => console.log('Resumed')

  // Only the range inputs have a value label that needs refreshing
  wrapper.onchange = ({ target }) => {
    if (target.type !== 'range') return
    handleChange(target)
  }

  buttons.addEventListener('click', handleButtonClick)
}

/**
 * Click handler for the button bar; dispatches on the class name of the
 * clicked element to the matching `speechSynthesis` method.
 *
 * @param {{ target: { className: string } }} event - click event
 */
function handleButtonClick({ target: { className } }) {
  switch (className) {
    case 'speak':
      // Do not queue the utterance again while speech is in progress
      if (!speechSynthesis.speaking) {
        convertTextToSpeech()
      }
      break
    case 'cancel':
      speechSynthesis.cancel()
      break
    case 'pause':
      speechSynthesis.pause()
      break
    case 'resume':
      speechSynthesis.resume()
      break
    default:
      break
  }
}

/**
 * Mirrors the current value of a control into the element that follows it.
 *
 * @param {HTMLInputElement} el - control whose value changed
 */
function handleChange(el) {
  const { value, nextElementSibling: valueLabel } = el
  valueLabel.textContent = value
}

/**
 * Reads the text and the playback settings from the form, applies them to
 * the shared utterance `U` and queues it for speaking.
 * Does nothing when the textarea contains only whitespace.
 */
function convertTextToSpeech() {
  const trimmed = textarea.value.trim()
  if (!trimmed) return
  U.text = trimmed

  // The option value is an index into `voices`. The lookup yields `undefined`
  // when nothing is selected or the list is empty; in that case the voice and
  // language of the utterance are left untouched instead of throwing.
  const voice = voices[select.value]
  if (voice) {
    U.voice = voice
    U.lang = voice.lang
  }

  // Slider values are strings; the utterance properties are numeric
  U.volume = Number(volume.value)
  U.rate = Number(rate.value)
  U.pitch = Number(pitch.value)

  speechSynthesis.speak(U)
}

SpeechSynthesisUtterance

:

SpeechSynthesisUtterance
  lang: "ru-RU"
  onboundary: null
  onend: () => console.log('Finished')
  onerror: (err) => console.error(err)
  onmark: null
  onpause: () => console.log('Paused')
  onresume: () => console.log('Resumed')
  onstart: () => console.log('Started')
  pitch: 1
  rate: 1
  text: "!  ?"
  voice: SpeechSynthesisVoice { voiceURI: "Google ", name: "Google ", lang: "ru-RU", localService: false, default: false }
  volume: 1

// Keyboard shortcuts for the player: s - speak, c - cancel, p - pause, r - resume
window.onkeydown = ({ key, target, ctrlKey, metaKey, altKey }) => {
  // Leave browser shortcuts such as Ctrl+C / Ctrl+R / Ctrl+P alone
  if (ctrlKey || metaKey || altKey) return
  // Letters typed into a form control must not trigger playback commands
  const isEditable =
    Boolean(target) &&
    (target.isContentEditable ||
      /^(input|textarea|select)$/i.test(target.tagName || ''))
  if (isEditable) return
  // Some synthetic keydown events carry no `key`
  if (typeof key !== 'string') return

  switch (key.toLowerCase()) {
    case 's':
      // Do not queue the utterance again while speech is in progress
      if (!speechSynthesis.speaking) {
        convertTextToSpeech()
      }
      break
    case 'c':
      speechSynthesis.cancel()
      break
    case 'p':
      speechSynthesis.pause()
      break
    case 'r':
      speechSynthesis.resume()
      break
    default:
      break
  }
}

SpeechSynthesis

, , - , . .

SpeechSynthesis

. , , , , , - . , , .

<div id="wrapper">
  <h1>Speech Synthesis - Page Reader</h1>
  <div>
    <button class="play" tabindex="1"></button>
    <p>
      JavaScript —   . 
      -,    .
         ECMAScript ( ECMA-262).
    </p>
  </div>
  <div>
    <button class="play" tabindex="2"></button>
    <p>
      JavaScript       
         .     
            
      -.
    </p>
  </div>
  <div>
    <button class="play" tabindex="3"></button>
    <p>
        :  , 
      ,   , 
      ,     .
    </p>
  </div>
</div>

(div

) (play

) , , , ( JavaScript

). , tabindex

, tab

space

. , , tabindex

, tab

.

, :

// Initial read of the voice list; refreshed when `voiceschanged` fires
let voices = speechSynthesis.getVoices()
// Voice applied to every utterance; assigned in the `voiceschanged` handler below
let defaultVoice

speechSynthesis.onvoiceschanged = () => {
  voices = speechSynthesis.getVoices()
  // NOTE(review): `find` yields `undefined` when no voice is named `Google `
  defaultVoice = voices.find((voice) => voice.name === 'Google ')

  // Click and keyboard handlers are registered only once this event has fired
  wrapper.addEventListener('click', handleClick)
  window.addEventListener('keydown', handleKeydown)
}

// Class names the play button switches between
const PLAY = 'play'
const PAUSE = 'pause'
const RESUME = 'resume'

/**
 * Click handler of the page reader. Dispatches on the class name of the
 * clicked element:
 *  - `play`   - cancels current speech and reads the following sibling's text
 *  - `pause`  - pauses speech and turns the button into `resume`
 *  - `resume` - resumes speech and turns the button back into `pause`
 *
 * @param {{ target: Element }} event - click event
 */
function handleClick({ target }) {
  switch (target.className) {
    case PLAY: {
      // Stop whatever is being read before starting the new block
      speechSynthesis.cancel()

      const { textContent } = target.nextElementSibling

      // Queue the text sentence by sentence (split on `.`), skipping empty
      // fragments. The trimmed sentence is what gets spoken, so the markup's
      // line breaks and indentation are not handed to the synthesizer.
      textContent.split('.').forEach((sentence) => {
        const trimmed = sentence.trim()
        if (trimmed) {
          speechSynthesis.speak(getUtterance(target, trimmed))
        }
      })
      break
    }
    case PAUSE:
      // The class name doubles as the state of the button
      target.className = RESUME
      speechSynthesis.pause()
      break
    case RESUME:
      target.className = PAUSE
      speechSynthesis.resume()
      break
    default:
      break
  }
}

/**
 * Keyboard handler of the page reader: `Escape` cancels the current speech,
 * every other key is ignored.
 *
 * @param {{ code: string }} event - keydown event
 */
function handleKeydown({ code }) {
  if (code !== 'Escape') return undefined
  return speechSynthesis.cancel()
}

/**
 * Creates an utterance for one piece of text and ties its lifecycle to the
 * button that started it.
 *
 * @param {Element} target - button whose class name reflects the playback state
 * @param {string} text - text to be spoken
 * @returns {SpeechSynthesisUtterance} configured utterance
 */
function getUtterance(target, text) {
  const U = new SpeechSynthesisUtterance(text)
  // `defaultVoice` is undefined when the preferred voice is not installed;
  // leave voice and language unset in that case instead of throwing
  if (defaultVoice) {
    U.voice = defaultVoice
    U.lang = defaultVoice.lang
  }
  U.volume = 1
  U.rate = 1
  U.pitch = 1

  // The button shows `pause` while speaking and returns to `play` afterwards
  U.onstart = () => {
    console.log('Started')
    target.className = PAUSE
  }
  U.onend = () => {
    console.log('Finished')
    target.className = PLAY
  }
  U.onerror = (err) => console.error(err)

  return U
}

— ( (.

)), , (, ) — textContent.split('.').forEach(...)

. , 220 ( Chrome). text-too-long

( ), , , , SpeechSynthesis

( ). , .

""

, SpeechSynthesis

, , , , , , WSA

— SpeechRecognition

.

<div id="wrapper">
  <h1>Speech Recognition - Dictaphone</h1>
  <textarea id="final_text" cols="30" rows="10"></textarea>
  <input type="text" id="interim_text" />
  <div id="buttons">
    <button class="start"></button>
    <button class="stop"></button>
    <button class="abort"></button>
    <button class="copy"></button>
    <button class="clear"></button>
  </div>
</div>

() (final_text

) ( ) (interim_text

), (buttons

). textarea

input

, , , . , , , Chrome .

(start

, stop

abort

), Clipboard API

.

// Accumulated final (already edited) transcript shown in the textarea
let final_transcript = ''
// True while recognition is meant to be running; the `end` handler restarts recognition only while it is set
let recognizing = false

, SpeechRecognition

:

// Use the standard constructor when present, otherwise the `webkit`-prefixed one
const speechRecognition =
  window.SpeechRecognition || window.webkitSpeechRecognition
// Recognizer instance shared by the handlers below
const recognition = new speechRecognition()
// Request continuous results instead of a single result per start
recognition.continuous = true
// Also report interim (not yet final) results
recognition.interimResults = true
// Ask for up to three alternatives per result
recognition.maxAlternatives = 3
// Recognition language
recognition.lang = 'ru-RU'

, :

// Log the start of a recognition session
recognition.onstart = () => {
  console.log('  ')
}
// Log errors and stop auto-restarting after errors that a restart cannot fix
recognition.onerror = ({ error }) => {
  console.error(error)
  // Denied permission, a missing microphone or an unsupported language will
  // fail again immediately; without clearing the flag the `end` handler
  // below would call `start()` in an endless loop
  const unrecoverable = [
    'not-allowed',
    'service-not-allowed',
    'audio-capture',
    'language-not-supported'
  ]
  if (unrecoverable.includes(error)) {
    recognizing = false
  }
}
// Restart recognition when it ends while the user still expects it to run
recognition.onend = () => {
  console.log('  ')
  // Restart only while the `recognizing` flag is still `true`
  if (!recognizing) return
  recognition.start()
}

SpeechRecognition

, , "" . "" . , :

// Lookup table used by `editInterim()` to replace a recognized word with the mapped character.
// NOTE(review): every property name is missing in this copy, which makes the
// literal a syntax error. The keys were presumably the spoken words for each
// punctuation mark and were lost in extraction - restore them from the
// original article before using this snippet.
const DICTIONARY = {
  : '.',
  : ',',
  : '?',
  : '!',
  : ':',
  : '-',
  : '\n',
  : '\t'
}

, , , SpeechGrammar

SpeechGrammarList

, JSpeech Grammar Format

, , .

/, . , . , , , , " ", " " .. , : '?'

, .

/**
 * Replaces every word that is a key of `DICTIONARY` (compared in lower case)
 * with its mapped value and leaves all other words unchanged.
 *
 * @param {string} s - recognized text, words separated by single spaces
 * @returns {string} text with dictionary words substituted
 */
function editInterim(s) {
  return s
    .split(' ')
    .map((rawWord) => {
      const word = rawWord.trim()
      const key = word.toLowerCase()
      // Own-property check: a plain truthiness lookup would also hit inherited
      // members such as `constructor` and splice a function into the text
      return Object.prototype.hasOwnProperty.call(DICTIONARY, key)
        ? DICTIONARY[key]
        : word
    })
    .join(' ')
}

/**
 * Removes the whitespace that precedes any of the characters `. + , ? ! : -`.
 *
 * @param {string} s - text to tidy up
 * @returns {string} text with the whitespace before those characters removed
 */
function editFinal(s) {
  const spaceBeforeMark = /\s+([.+,?!:-])/g
  return s.replace(spaceBeforeMark, (_whole, mark) => mark)
}

editInterim()

, , . , . word = word.trim()

, "" , .

editFinal()

— editInterim()

. , .

, , — result

. :

// Handles recognition results: final pieces are appended to the stored
// transcript, interim pieces are only displayed
recognition.onresult = ({ resultIndex, results }) => {
  // Interim text is rebuilt from scratch on every event
  let interim_transcript = ''

  // Walk through the results starting at the first one that changed
  for (let idx = resultIndex; idx < results.length; idx += 1) {
    const current = results[idx]
    // Only the first alternative of each result is used
    const { transcript } = current[0]

    if (!current.isFinal) {
      interim_transcript += transcript
      continue
    }
    // Final result: substitute dictionary words, then append
    final_transcript += editInterim(transcript)
  }

  // Show the interim text in the `input`
  interim_text.value = interim_transcript
  // Tidy up the accumulated transcript and show it in the `textarea`
  final_transcript = editFinal(final_transcript)
  final_text.value = final_transcript
}

SpeechRecognitionEvent
  bubbles: false
  cancelBubble: false
  cancelable: false
  composed: false
  currentTarget: SpeechRecognition {grammars: SpeechGrammarList, lang: "ru-RU", continuous: true, interimResults: true, maxAlternatives: 3, …}
  defaultPrevented: false
  emma: null
  eventPhase: 0
  interpretation: null
  isTrusted: true
  path: []
  resultIndex: 1
  //     
  results: SpeechRecognitionResultList {0: SpeechRecognitionResult, 1: SpeechRecognitionResult, length: 2}
  returnValue: true
  srcElement: SpeechRecognition {grammars: SpeechGrammarList, lang: "ru-RU", continuous: true, interimResults: true, maxAlternatives: 3, …}
  target: SpeechRecognition {grammars: SpeechGrammarList, lang: "ru-RU", continuous: true, interimResults: true, maxAlternatives: 3, …}
  timeStamp: 59862.61999979615
  type: "result"

(SpeechRecognitionResultList) :

results: SpeechRecognitionResultList
  0: SpeechRecognitionResult
    0: SpeechRecognitionAlternative
      confidence: 0.7990190982818604
      transcript: ""
    isFinal: true
    length: 1
  length: 1

e.results[i][0].transcript

. , maxAlternatives = 3

, SpeechRecognitionAlternative

. ( 0

) .

, , :

// Click handler of the dictaphone button bar; dispatches on the class name
// of the clicked element
buttons.onclick = ({ target }) => {
  switch (target.className) {
    case 'start':
      // `start()` throws an InvalidStateError when recognition is already
      // running, so a repeated click is ignored
      if (recognizing) break
      // Begin with an empty transcript
      final_transcript = ''
      recognition.start()
      recognizing = true
      // Clear both output fields
      final_text.value = ''
      interim_text.value = ''
      break
    case 'stop':
      // Stop listening; clearing the flag disables the automatic restart
      recognition.stop()
      recognizing = false
      break
    case 'abort':
      // Abort recognition; clearing the flag disables the automatic restart
      recognition.abort()
      recognizing = false
      break
    case 'copy':
      // Copy the content of the `textarea`; log a rejection (for example a
      // denied clipboard permission) instead of leaving it unhandled
      navigator.clipboard
        .writeText(final_text.value)
        .catch((err) => console.error(err))
      // Change the button text now and set it again after three seconds.
      // The timer fires once, so it needs no explicit clearing.
      target.textContent = ''
      setTimeout(() => {
        target.textContent = ''
      }, 3000)
      break
    case 'clear':
      // Forget the transcript and empty the `textarea`
      final_transcript = ''
      final_text.value = ''
      break
    default:
      break
  }
}

. "", , ( speechstart

), ( ), , " ". input

- " ", textarea

". ?". , .

, , . : () => removeLastWord()

( , typeof DICTIONARY[word] === 'function'

) :

/**
 * Removes the last word from the recognized text.
 *
 * Both the `textarea` and `final_transcript` are updated: the result handler
 * rewrites the `textarea` from `final_transcript` on every event, so changing
 * only the `textarea` would be undone by the next result.
 */
function removeLastWord() {
  // Ignore trailing whitespace so the cut removes a word, not just a space
  const oldStr = final_text.value.trimEnd()
  const cutAt = oldStr.lastIndexOf(' ')
  // Without any space the text is a single word and is removed entirely
  const newStr = cutAt === -1 ? '' : oldStr.substring(0, cutAt)
  final_text.value = newStr
  final_transcript = newStr
}

, — , , .

SpeechRecognition

, , .

, .

pages

:

// home.js
export default /*html*/ `
<div id="wrapper">
  <div>Section 1</div>
  <div>Section 2</div>
  <div>Section 3</div>
  <div>Section 4</div>
  <div>Section 5</div>
  <div>Section 6</div>
  <div>Section 7</div>
  <div>Section 8</div>
  <div>Section 9</div>
</div>
`

// product.js
export default /*html*/ `
<h1>This is the Product Page</h1>
`

// about.js
export default /*html*/ `
<h1>This is the About Page</h1>
`

import HomePage from './pages/home.js'
import ProductPage from './pages/product.js'
import AboutPage from './pages/about.js'

// The home page is rendered into `body` on load
const { body } = document
body.innerHTML = HomePage

// Scroll directions
const DOWN = 'down'
const UP = 'up'
const RIGHT = 'right'
const LEFT = 'left'

// Voice commands: each key is a spoken word, each value the action it runs
const ACTIONS = {
  // Page switching: replace the content of `body` with the page markup
  home: () => (body.innerHTML = HomePage),
  product: () => (body.innerHTML = ProductPage),
  about: () => (body.innerHTML = AboutPage),

  // Scrolling
  down: () => scroll(DOWN),
  up: () => scroll(UP),
  left: () => scroll(LEFT),
  right: () => scroll(RIGHT),

  // Theme switching via the `class` attribute of `body`
  light: () => body.removeAttribute('class'),
  dark: () => (body.className = 'dark')
}

SpeechRecognition

, , ( — : recognition.lang = 'en-US'

).

result

:

// Handles recognition results of the voice navigator: every word of a final
// result that names a command in `ACTIONS` runs that command
recognition.onresult = (e) => {
  for (let i = e.resultIndex; i < e.results.length; i++) {
    // Interim results are ignored
    if (!e.results[i].isFinal) continue

    const result = e.results[i][0].transcript.toLowerCase()
    // Log what was recognized to make unexpected matches easy to spot
    console.log(result)

    result.split(' ').forEach((rawWord) => {
      const word = rawWord.trim()
      // Own-property check: inherited members such as `constructor` or
      // `__proto__` must not be treated as (or called like) commands
      if (Object.prototype.hasOwnProperty.call(ACTIONS, word)) {
        ACTIONS[word]()
      }
    })
  }
}

/**
 * Smoothly scrolls the window by one viewport in the given direction.
 * `DOWN`/`UP` move along the vertical axis; any other value is passed to
 * `scrollTo` as a `left` position.
 *
 * @param {string} direction - one of `DOWN`, `UP`, `RIGHT`, `LEFT`
 */
function scroll(direction) {
  const isVertical = direction === DOWN || direction === UP

  // Target offset on the chosen axis; stays undefined for an unknown direction
  let newPosition
  if (direction === DOWN) {
    newPosition = scrollY + innerHeight
  } else if (direction === UP) {
    newPosition = scrollY - innerHeight
  } else if (direction === RIGHT) {
    newPosition = scrollX + innerWidth
  } else if (direction === LEFT) {
    newPosition = scrollX - innerWidth
  }

  const options = isVertical
    ? { top: newPosition, behavior: 'smooth' }
    : { left: newPosition, behavior: 'smooth' }
  scrollTo(options)
}

, :

// Keyboard control of the voice navigator: Space starts recognition, Escape stops it
window.addEventListener('keydown', (e) => {
  switch (e.code) {
    case 'Space':
      // Only the handled keys are suppressed; calling `preventDefault()` for
      // every key would disable Tab, F5 and all other keyboard input
      e.preventDefault()
      // Key repeat or a second press must not call `start()` on a running
      // recognizer, which throws an InvalidStateError
      if (recognizing) break
      recognition.start()
      recognizing = true
      break
    case 'Escape':
      e.preventDefault()
      recognition.stop()
      recognizing = false
      break
    default:
      break
  }
})

. , , :

home

, product

, about

—
dark

, light

—
down

, up

, left

, right

— ( , scroll

, , scroll down

)

, WSA

JavaScript

, , "" -. , . , WSA

— , , , , .

, WSA

. , , SpeechSynthesis

SpeechRecognition

. , — , , , , , . : — , , - ""

,

, , ,

. , . .

10% !

The Ultimate Guide to Practical Use of the Web Speech API

Introduction

""

More articles: