Unity: Speech Detection and Synthesis Together

To be able to call out “fire” and “stop”, I made some edits to the `F3DPlayerTurretController.cs` script.

using UnityEngine;
using System.Collections;
using UnityWebGLSpeechDetection;

namespace Forge3D
{
    public class F3DPlayerTurretController : MonoBehaviour
    {
        RaycastHit hitInfo; // Raycast structure; not touched by the methods visible in this excerpt
        public F3DTurret turret; // Turret reference; not touched by the methods visible in this excerpt
        bool isFiring; // Is turret currently in firing state (toggled by CheckForFire)
        public F3DFXController fxController; // Receives the Fire() / Stop() calls made by CheckForFire

        // reference to the speech detection proxy; assigned in Start and used to
        // subscribe to detection results and to abort the current detection
        private ISpeechDetectionPlugin _mSpeechDetectionPlugin = null;

        // States of the voice-driven fire control.
        // HandleDetectionResult enters the DETECTED_* states, LateUpdate advances
        // them one step at a time, and CheckForFire reacts to the *_ONCE states.
        enum FireState
        {
            IDLE,           // not firing; the only state from which "fire" is accepted
            DETECTED_FIRE,  // "fire" was heard; LateUpdate turns this into FIRE_ONCE
            FIRE_ONCE,      // CheckForFire starts firing while this state is visible
            FIRE_IDLE,      // firing; the only state from which "stop" is accepted
            DETECTED_STOP,  // "stop" was heard; LateUpdate turns this into STOP_ONCE
            STOP_ONCE       // CheckForFire stops firing; LateUpdate then returns to IDLE
        }

        // detect the word once in all updates
        // (static: a single state value is shared by every turret controller instance)
        private static FireState _sFireState = FireState.IDLE;

        // make sure all turrets detect the async word in their update event
        // (set to true by Update, cleared by the LateUpdate call that advances the state)
        private static bool _sReadyForLateUpdate = false;

        // Coroutine that sets up speech detection for this turret: fetch the proxy,
        // wait until it reports itself available, subscribe the result handler and
        // finally abort so that previously collected words are discarded.
        private IEnumerator Start()
        {
            // keep the proxy in a field; the result handler needs it to call Abort()
            _mSpeechDetectionPlugin = ProxySpeechDetectionPlugin.GetInstance();

            // without a proxy there is nothing to subscribe to
            if (_mSpeechDetectionPlugin == null)
            {
                Debug.LogError("Proxy Speech Detection Plugin is not set!");
                yield break;
            }

            // yield one frame at a time until the plugin is available
            while (!_mSpeechDetectionPlugin.IsAvailable())
            {
                yield return null;
            }

            // from now on detection results are delivered to HandleDetectionResult
            _mSpeechDetectionPlugin.AddListenerOnDetectionResult(HandleDetectionResult);

            // start from a clean slate: abort and clear existing words
            _mSpeechDetectionPlugin.Abort();
        }

        // Handler for speech detection events.
        // Walks every alternative transcript of every result and looks for the
        // spoken commands "fire" and "stop". A match only changes the shared fire
        // state when that state is the matching resting state (IDLE for "fire",
        // FIRE_IDLE for "stop"); any match aborts the current detection.
        //   sender - event source (unused)
        //   args   - event data; a null detectionResult or results array is ignored
        void HandleDetectionResult(object sender, SpeechDetectionEventArgs args)
        {
            if (null == args.detectionResult)
            {
                return;
            }
            SpeechRecognitionResult[] results = args.detectionResult.results;
            if (null == results)
            {
                return;
            }
            bool doAbort = false;
            foreach (SpeechRecognitionResult result in results)
            {
                SpeechRecognitionAlternative[] alternatives = result.alternatives;
                if (null == alternatives)
                {
                    continue;
                }
                foreach (SpeechRecognitionAlternative alternative in alternatives)
                {
                    if (string.IsNullOrEmpty(alternative.transcript))
                    {
                        continue;
                    }

                    // Lowercase with the invariant culture so the keyword checks below do
                    // not depend on the current locale (a Turkish-culture ToLower() maps
                    // 'I' to a dotless 'ı', so "FIRE" would never contain "fire").
                    string lower = alternative.transcript.ToLowerInvariant();
                    Debug.LogFormat("Detected: {0}", lower);
                    if (lower.Contains("fire"))
                    {
                        // only start firing from the resting state
                        if (_sFireState == FireState.IDLE)
                        {
                            _sFireState = FireState.DETECTED_FIRE;
                        }
                        doAbort = true;
                    }

                    // checked independently of "fire": one transcript may hold both words
                    if (lower.Contains("stop"))
                    {
                        // only stop while the turret is in its firing state
                        if (_sFireState == FireState.FIRE_IDLE)
                        {
                            _sFireState = FireState.DETECTED_STOP;
                        }
                        doAbort = true;
                    }
                }
            }

            // abort detection on match for faster matching on words instead of complete sentences
            if (doAbort)
            {
                _mSpeechDetectionPlugin.Abort();
            }
        }

        // Advances the shared fire state by one step so that a word detected
        // asynchronously becomes visible to the following Update calls.
        // Only the first LateUpdate after Update raised the ready flag does any work;
        // IDLE and FIRE_IDLE are resting states and are left untouched.
        void LateUpdate()
        {
            if (!_sReadyForLateUpdate)
            {
                return;
            }

            // consume the flag so the state is not advanced again until Update re-arms it
            _sReadyForLateUpdate = false;

            if (_sFireState == FireState.DETECTED_FIRE)
            {
                _sFireState = FireState.FIRE_ONCE;
            }
            else if (_sFireState == FireState.FIRE_ONCE)
            {
                _sFireState = FireState.FIRE_IDLE;
            }
            else if (_sFireState == FireState.DETECTED_STOP)
            {
                _sFireState = FireState.STOP_ONCE;
            }
            else if (_sFireState == FireState.STOP_ONCE)
            {
                _sFireState = FireState.IDLE;
            }
        }

        // Per-frame update: runs the turn and fire checks, then raises the shared
        // flag that lets LateUpdate advance the fire state.
        void Update()
        {
            CheckForTurn();
            CheckForFire();

            // After update, use one late update to detect the async word
            _sReadyForLateUpdate = true;
        }

        // Starts or stops the weapon effect in response to the voice-driven fire state.
        // The *_ONCE states replace the original mouse input (Mouse0 down to fire,
        // Mouse0 up to stop).
        void CheckForFire()
        {
            // "fire" was spoken and the turret is not firing yet
            if (_sFireState == FireState.FIRE_ONCE && !isFiring)
            {
                isFiring = true;
                fxController.Fire();
            }

            // "stop" was spoken while the turret is firing
            if (_sFireState == FireState.STOP_ONCE && isFiring)
            {
                isFiring = false;
                fxController.Stop();
            }
        }

To be able to call out the names of weapons, cycle through them with “left” and “right”, and have the game speak a confirmation, I made some edits to the `F3DFXController.cs` script.

using System.Collections;
using System;
using UnityEngine;
using UnityEngine.UI;
using UnityWebGLSpeechDetection;
using UnityWebGLSpeechSynthesis;

namespace Forge3D
{
// Weapon types; F3DFXController stores the active one in DefaultFXType
    public enum F3DFXType
    {
        Vulcan,
        SoloGun,
        Sniper,
        ShotGun,
        Seeker,
        RailGun,
        PlasmaGun,
        PlasmaBeam,      // selected when the spoken word "beam" is detected
        PlasmaBeamHeavy,
        LightningGun,    // selected when the spoken word "lightning" is detected
        FlameRed,
        LaserImpulse
    }

    public class F3DFXController : MonoBehaviour
    {
        /// <summary>
        /// Voices drop down; filled with the voices returned by the synthesis proxy.
        /// SetIfReadyForDefaultVoice only attaches its change listener when this is set.
        /// </summary>
        public Dropdown _mDropdownVoices = null;

        /// <summary>
        /// Reference to the speech detection proxy (assigned in Start)
        /// </summary>
        private ISpeechDetectionPlugin _mSpeechDetectionPlugin = null;

        /// <summary>
        /// Reference to the speech synthesis proxy (assigned in Start)
        /// </summary>
        private ISpeechSynthesisPlugin _mSpeechSynthesisPlugin = null;

        /// <summary>
        /// Reference to the supported voices, as delivered to the GetVoices callback
        /// </summary>
        private VoiceResult _mVoiceResult = null;

        /// <summary>
        /// Reference to the utterance, voice, and text to speak
        /// </summary>
        private SpeechSynthesisUtterance _mSpeechSynthesisUtterance = null;

        /// <summary>
        /// Track when the utterance is created (set in the utterance creation callback)
        /// </summary>
        private bool _mUtteranceSet = false;

        /// <summary>
        /// Track when the voices are created (set in the GetVoices callback)
        /// </summary>
        private bool _mVoicesSet = false;

        // States of the voice-driven weapon switching.
        // HandleDetectionResult enters the DETECTED_* states (only from IDLE),
        // LateUpdate advances them, and Update reacts to the *_ONCE states.
        enum WeaponState
        {
            IDLE,            // no pending command; "left" / "right" are accepted
            DETECTED_LEFT,   // "left" was heard; LateUpdate turns this into LEFT_ONCE
            LEFT_ONCE,       // Update calls NextWeapon(); LateUpdate then returns to IDLE
            DETECTED_RIGHT,  // "right" was heard; LateUpdate turns this into RIGHT_ONCE
            RIGHT_ONCE       // Update calls PrevWeapon(); LateUpdate then returns to IDLE
        }

        // detect the word once in all updates
        // (static: a single state value is shared by every F3DFXController instance)
        private static WeaponState _sWeaponState = WeaponState.IDLE;

        // make sure all turrets detect the async word in their update event
        // (set to true by Update, cleared by the LateUpdate call that advances the state)
        private static bool _sReadyForLateUpdate = false;

        // Singleton instance
        // NOTE(review): not assigned anywhere in this excerpt — presumably set elsewhere in the class
        public static F3DFXController instance;

        // Coroutine that prepares both speech proxies.
        // It waits for the detection proxy and then the synthesis proxy to become
        // available (giving up with an error if either one is missing), subscribes
        // the detection handler, requests the voice list and creates the utterance.
        private IEnumerator Start()
        {
            // --- speech detection proxy ---
            _mSpeechDetectionPlugin = ProxySpeechDetectionPlugin.GetInstance();
            if (_mSpeechDetectionPlugin == null)
            {
                Debug.LogError("Proxy Speech Detection Plugin is not set!");
                yield break;
            }

            // yield one frame at a time until detection is available
            while (!_mSpeechDetectionPlugin.IsAvailable())
            {
                yield return null;
            }

            // --- speech synthesis proxy ---
            _mSpeechSynthesisPlugin = ProxySpeechSynthesisPlugin.GetInstance();
            if (_mSpeechSynthesisPlugin == null)
            {
                Debug.LogError("Proxy Speech Synthesis Plugin is not set!");
                yield break;
            }

            // yield one frame at a time until synthesis is available
            while (!_mSpeechSynthesisPlugin.IsAvailable())
            {
                yield return null;
            }

            // detection results are delivered to HandleDetectionResult from here on
            _mSpeechDetectionPlugin.AddListenerOnDetectionResult(HandleDetectionResult);

            // start from a clean slate: abort and clear existing words
            _mSpeechDetectionPlugin.Abort();

            // request the voice list; the result arrives in a callback
            GetVoices();

            // request the utterance object; it also arrives in a callback
            _mSpeechSynthesisPlugin.CreateSpeechSynthesisUtterance(createdUtterance =>
            {
                _mSpeechSynthesisUtterance = createdUtterance;

                // remember that the utterance half of the setup is done
                _mUtteranceSet = true;

                // applies the default voice once the voices have arrived as well
                SetIfReadyForDefaultVoice();
            });
        }

        /// <summary>
        /// Requests the supported voices from the synthesis proxy. When they arrive
        /// they are stored, shown in the voices drop down, and the default voice is
        /// applied if the utterance is ready too.
        /// </summary>
        private void GetVoices()
        {
            _mSpeechSynthesisPlugin.GetVoices(availableVoices =>
            {
                // keep the result; it is needed later to look voices up
                _mVoiceResult = availableVoices;

                // fill the drop down with the received voices
                SpeechSynthesisUtils.PopulateVoicesDropdown(_mDropdownVoices, _mVoiceResult);

                // remember that the voices half of the setup is done
                _mVoicesSet = true;

                // applies the default voice once the utterance has arrived as well
                SetIfReadyForDefaultVoice();
            });
        }

        /// <summary>
        /// Applies the default voice to the utterance and wires up the voices drop
        /// down. Does nothing until both the voices and the utterance are ready.
        /// </summary>
        private void SetIfReadyForDefaultVoice()
        {
            // called from both callbacks; only proceed when both have completed
            if (!(_mVoicesSet && _mUtteranceSet))
            {
                return;
            }

            // select the default voice in the drop down and let the user change it
            SpeechSynthesisUtils.SetDefaultVoice(_mDropdownVoices);
            SpeechSynthesisUtils.SetInteractable(true, _mDropdownVoices);

            // look the default voice up in the received voices and use it for speaking
            Voice defaultVoice = SpeechSynthesisUtils.GetVoice(_mVoiceResult, SpeechSynthesisUtils.GetDefaultVoice());
            _mSpeechSynthesisPlugin.SetVoice(_mSpeechSynthesisUtterance, defaultVoice);

            // the change listener needs an actual drop down to attach to
            if (!_mDropdownVoices)
            {
                return;
            }

            // on every selection change, set the chosen voice on the utterance
            _mDropdownVoices.onValueChanged.AddListener(selectedIndex =>
            {
                SpeechSynthesisUtils.HandleVoiceChanged(_mDropdownVoices,
                    _mVoiceResult,
                    _mSpeechSynthesisUtterance,
                    _mSpeechSynthesisPlugin);
            });
        }

        /// <summary>
        /// Speaks the given text with the shared utterance, cancelling any speech
        /// first. Silently does nothing while the voices or the utterance are not
        /// ready yet.
        /// </summary>
        /// <param name="text">The text to speak</param>
        private void Speak(string text)
        {
            bool ready = _mVoicesSet && _mUtteranceSet;
            if (!ready)
            {
                return;
            }

            // cancel first so a new announcement does not wait behind an old one
            _mSpeechSynthesisPlugin.Cancel();

            // put the new text on the utterance, then speak it
            _mSpeechSynthesisPlugin.SetText(_mSpeechSynthesisUtterance, text);
            _mSpeechSynthesisPlugin.Speak(_mSpeechSynthesisUtterance);
        }

        /// <summary>
        /// Handler for speech detection events. Looks through the alternative
        /// transcripts of every result for one of the commands "left", "right",
        /// "lightning" or "beam". Within one result the first alternative that
        /// contains a command wins; the remaining results are still examined.
        /// Any match aborts the current detection.
        /// </summary>
        /// <param name="sender">Event source (unused)</param>
        /// <param name="args">Event data; a null detectionResult or results array is ignored</param>
        void HandleDetectionResult(object sender, SpeechDetectionEventArgs args)
        {
            if (null == args.detectionResult)
            {
                return;
            }
            SpeechRecognitionResult[] results = args.detectionResult.results;
            if (null == results)
            {
                return;
            }
            bool doAbort = false;
            foreach (SpeechRecognitionResult result in results)
            {
                SpeechRecognitionAlternative[] alternatives = result.alternatives;
                if (null == alternatives)
                {
                    continue;
                }
                foreach (SpeechRecognitionAlternative alternative in alternatives)
                {
                    if (string.IsNullOrEmpty(alternative.transcript))
                    {
                        continue;
                    }

                    // Lowercase with the invariant culture so the keyword checks below do
                    // not depend on the current locale (a Turkish-culture ToLower() maps
                    // 'I' to a dotless 'ı', so "RIGHT" would never contain "right").
                    string lower = alternative.transcript.ToLowerInvariant();
                    Debug.LogFormat("Detected: {0}", lower);
                    if (lower.Contains("left"))
                    {
                        // a new direction is only accepted when no command is pending
                        if (_sWeaponState == WeaponState.IDLE)
                        {
                            _sWeaponState = WeaponState.DETECTED_LEFT;
                        }
                        doAbort = true;
                        break;
                    }

                    else if (lower.Contains("right"))
                    {
                        // a new direction is only accepted when no command is pending
                        if (_sWeaponState == WeaponState.IDLE)
                        {
                            _sWeaponState = WeaponState.DETECTED_RIGHT;
                        }
                        doAbort = true;
                        break;
                    }

                    else if (lower.Contains("lightning"))
                    {
                        // switch and announce only when the weapon actually changes
                        if (DefaultFXType != F3DFXType.LightningGun)
                        {
                            DefaultFXType = F3DFXType.LightningGun;
                            Speak(string.Format("{0} is active, sir", DefaultFXType));
                        }
                        doAbort = true;
                        break;
                    }

                    else if (lower.Contains("beam"))
                    {
                        // switch and announce only when the weapon actually changes
                        if (DefaultFXType != F3DFXType.PlasmaBeam)
                        {
                            DefaultFXType = F3DFXType.PlasmaBeam;
                            Speak(string.Format("{0} is active, sir", DefaultFXType));
                        }
                        doAbort = true;
                        break;
                    }
                }
            }

            // abort detection on match for faster matching on words instead of complete sentences
            if (doAbort)
            {
                _mSpeechDetectionPlugin.Abort();
            }
        }

        // Advances the shared weapon state by one step so that a word detected
        // asynchronously becomes visible to the following Update calls.
        // Only the first LateUpdate after Update raised the ready flag does any work;
        // IDLE is the resting state and is left untouched.
        void LateUpdate()
        {
            if (!_sReadyForLateUpdate)
            {
                return;
            }

            // consume the flag so the state is not advanced again until Update re-arms it
            _sReadyForLateUpdate = false;

            switch (_sWeaponState)
            {
                // a detected word becomes the signal that Update reacts to
                case WeaponState.DETECTED_LEFT:
                    _sWeaponState = WeaponState.LEFT_ONCE;
                    break;
                case WeaponState.DETECTED_RIGHT:
                    _sWeaponState = WeaponState.RIGHT_ONCE;
                    break;

                // a signal that has been exposed goes back to the resting state
                case WeaponState.LEFT_ONCE:
                case WeaponState.RIGHT_ONCE:
                    _sWeaponState = WeaponState.IDLE;
                    break;
            }
        }

        void Update()
        {
            // Switch weapon types using the spoken words instead of keyboard keys.
            // NOTE(review): the replaced keyboard code mapped RightArrow to NextWeapon()
            // and LeftArrow to PrevWeapon(); the spoken words are mapped the other way
            // round ("left" -> NextWeapon, "right" -> PrevWeapon) — confirm this is intended.
            if (WeaponState.LEFT_ONCE == _sWeaponState)
            {
                NextWeapon();
            }

            if (WeaponState.RIGHT_ONCE == _sWeaponState)
            {
                PrevWeapon();
            }

            // Re-arm the flag so one LateUpdate may advance the async word state
            _sReadyForLateUpdate = true;
        }