基于JavaScript/HTML实现Azure Speech实时语音转写及说话人区分功能的技术问询
基于JavaScript/HTML实现Azure Speech实时语音转写及说话人区分功能的技术问询
我正在开发一个HTML+JavaScript网页应用,核心需求是:实时获取麦克风语音输入,完成转写的同时自动区分不同说话人,并且持续运行直到点击停止按钮。
目前基础的语音转写功能已经正常工作,但当我修改配置开启说话人区分(Diarization)后,发现转写文本能正常输出,却始终没有识别并标记说话人ID——尽管我已经在SpeechConfig里设置了启用说话人区分的参数。
以下是我调整后的完整代码,希望有人能帮我排查下问题所在:
<!DOCTYPE html> <html> <head> <title>Speech Sample</title> <meta charset="utf-8" /> <script type="text/javascript" src="./difflib-browser.js"></script> </head> <body style="font-family:'Helvetica Neue',Helvetica,Arial,sans-serif; font-size:13px;"> <div id="warning"> <h1 style="font-weight:500;">Speech Recognition Speech SDK not found (microsoft.cognitiveservices.speech.sdk.bundle.js missing).</h1> </div> <div id="content" style="display:none"> <table> <tr> <td></td> <td> <h2 style="font-weight:500;">Microsoft Cognitive Services Speech SDK</h2> <h3 style="font-weight:500;">Javascript Browser Sample</h3> </td> </tr> <tr> <td align="right"><a href="https://www.microsoft.com/cognitive-services/sign-up" target="_blank">Subscription</a>:</td> <td><input id="key" type="text" size="60" placeholder="required: speech subscription key"></td> </tr> <tr> <td align="right">Region:</td> <td align="left"> <select id="regionOptions"> <option value="westus" selected="selected">West US</option> <option value="westus2">West US 2</option> <option value="eastus">East US</option> <option value="eastus2">East US 2</option> <option value="eastasia">East Asia</option> <option value="southeastasia">South East Asia</option> <option value="centralindia">Central India</option> <option value="northeurope">North Europe</option> <option value="westeurope">West Europe</option> </select> </td> </tr> <tr> <td align="right">Recognition language:</td> <td align="left"> <select id="languageOptions"> <option value="en-US" selected="selected">English - US</option> <!-- Add other languages as desired --> </select> </td> </tr> <tr> <td align="right"><b></b></td> <td> <button id="scenarioStartButton">Start</button> <button id="scenarioStopButton" disabled="disabled">Stop</button> </td> </tr> <tr> <td align="right">Results:</td> <td align="left"> <textarea id="phraseDiv" style="display: inline-block;width:500px;height:200px"></textarea> </td> </tr> <tr> <td align="right">Events:</td> <td align="left"> <textarea id="statusDiv" style="display: inline-block;width:500px;height:200px;overflow: scroll;white-space: nowrap;"> </textarea> </td> </tr> </table> </div> <!-- Speech SDK REFERENCE --> <script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script> <!-- Speech SDK presence check --> <script> function Initialize(onComplete) { if (!!window.SpeechSDK) { document.getElementById('content').style.display = 'block'; document.getElementById('warning').style.display = 'none'; onComplete(window.SpeechSDK); } } </script> <script> var SpeechSDK; var phraseDiv, statusDiv; var key, authorizationToken; var regionOptions; var recognizer; document.addEventListener("DOMContentLoaded", function () { scenarioStartButton = document.getElementById('scenarioStartButton'); scenarioStopButton = document.getElementById('scenarioStopButton'); phraseDiv = document.getElementById("phraseDiv"); statusDiv = document.getElementById("statusDiv"); key = document.getElementById("key"); regionOptions = document.getElementById("regionOptions"); scenarioStartButton.addEventListener("click", function () { doContinuousRecognition(); }); scenarioStopButton.addEventListener("click", function() { if (recognizer) { recognizer.stopContinuousRecognitionAsync(); } }); }); function getAudioConfig() { return SpeechSDK.AudioConfig.fromDefaultMicrophoneInput(); } function getSpeechConfig() { var speechConfig = SpeechSDK.SpeechConfig.fromSubscription(key.value, regionOptions.value); speechConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_EnableSpeakerDiarization, "true"); // Enable speaker diarization console.log("Speaker diarization enabled."); // Log confirmation return speechConfig; } function onRecognized(sender, recognitionEventArgs) { var result = recognitionEventArgs.result; console.log(result); // Log the entire result for debugging phraseDiv.scrollTop = phraseDiv.scrollHeight; var speakerId = result.speakerId ? ` [Speaker ID: ${result.speakerId}]` : ''; statusDiv.innerHTML += `(recognized) Reason: ${SpeechSDK.ResultReason[result.reason]}`; phraseDiv.innerHTML += `${result.text}${speakerId}\r\n`; } function doContinuousRecognition() { var audioConfig = getAudioConfig(); var speechConfig = getSpeechConfig(); if (!audioConfig || !speechConfig) return; recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioConfig); recognizer.recognized = onRecognized; recognizer.startContinuousRecognitionAsync(); } Initialize(async function (speechSdk) { SpeechSDK = speechSdk; }); </script> </body> </html>
备注:内容来源于stack exchange,提问作者user29960912




