基于JavaScript/HTML实现Azure Speech实时语音转写及说话人区分功能的技术问询

阿华AIGC实验室

2026-4-13

我正在开发一个HTML+JavaScript网页应用，核心需求是：实时获取麦克风语音输入，完成转写的同时自动区分不同说话人，并且持续运行直到点击停止按钮。

目前基础的语音转写功能已经正常工作，但当我修改配置开启说话人区分（Diarization）后，发现转写文本能正常输出，却始终没有识别并标记说话人ID——尽管我已经在SpeechConfig里设置了启用说话人区分的参数。

以下是我调整后的完整代码，希望有人能帮我排查下问题所在：

<!DOCTYPE html>
<html>

<head>
    <title>Speech Sample</title>
    <meta charset="utf-8" />
    <script type="text/javascript" src="./difflib-browser.js"></script>
</head>

<body style="font-family:'Helvetica Neue',Helvetica,Arial,sans-serif; font-size:13px;">
    <div id="warning">
        <h1 style="font-weight:500;">Speech Recognition Speech SDK not found
            (microsoft.cognitiveservices.speech.sdk.bundle.js missing).</h1>
    </div>
    <div id="content" style="display:none">
        <table>
            <tr>
                <td></td>
                <td>
                    <h2 style="font-weight:500;">Microsoft Cognitive Services Speech SDK</h2>
                    <h3 style="font-weight:500;">Javascript Browser Sample</h3>
                </td>
            </tr>
            <tr>
                <td align="right"><a href="https://www.microsoft.com/cognitive-services/sign-up"
                        target="_blank">Subscription</a>:</td>
                <td><input id="key" type="text" size="60" placeholder="required: speech subscription key"></td>
            </tr>
            <tr>
                <td align="right">Region:</td>
                <td align="left">
                    <select id="regionOptions">
                        <option value="westus" selected="selected">West US</option>
                        <option value="westus2">West US 2</option>
                        <option value="eastus">East US</option>
                        <option value="eastus2">East US 2</option>
                        <option value="eastasia">East Asia</option>
                        <option value="southeastasia">South East Asia</option>
                        <option value="centralindia">Central India</option>
                        <option value="northeurope">North Europe</option>
                        <option value="westeurope">West Europe</option>
                    </select>
                </td>
            </tr>
            <tr>
                <td align="right">Recognition language:</td>
                <td align="left">
                    <select id="languageOptions">
                        <option value="en-US" selected="selected">English - US</option>
                        <!-- Add other languages as desired -->
                    </select>
                </td>
            </tr>
            <tr>
                <td align="right"><b></b></td>
                <td>
                    <button id="scenarioStartButton">Start</button>
                    <button id="scenarioStopButton" disabled="disabled">Stop</button>
                </td>
            </tr>
            <tr>
                <td align="right">Results:</td>
                <td align="left">
                    <textarea id="phraseDiv" style="display: inline-block;width:500px;height:200px"></textarea>
                </td>
            </tr>
            <tr>
                <td align="right">Events:</td>
                <td align="left">
                    <textarea id="statusDiv"
                        style="display: inline-block;width:500px;height:200px;overflow: scroll;white-space: nowrap;">
                    </textarea>
                </td>
            </tr>
        </table>
    </div>

    <!-- Speech SDK REFERENCE -->
    <script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script>

    <!-- Speech SDK presence check -->
    <script>
        function Initialize(onComplete) {
            if (!!window.SpeechSDK) {
                document.getElementById('content').style.display = 'block';
                document.getElementById('warning').style.display = 'none';
                onComplete(window.SpeechSDK);
            }
        }
    </script>

    <script>
        var SpeechSDK;
        var phraseDiv, statusDiv;
        var key, authorizationToken;
        var regionOptions;
        var recognizer;
        
        document.addEventListener("DOMContentLoaded", function () {
            scenarioStartButton = document.getElementById('scenarioStartButton');
            scenarioStopButton = document.getElementById('scenarioStopButton');
            phraseDiv = document.getElementById("phraseDiv");
            statusDiv = document.getElementById("statusDiv");
            key = document.getElementById("key");
            regionOptions = document.getElementById("regionOptions");

            scenarioStartButton.addEventListener("click", function () {
                doContinuousRecognition();
            });

            scenarioStopButton.addEventListener("click", function() {
                if (recognizer) {
                    recognizer.stopContinuousRecognitionAsync();
                }
            });
        });

        function getAudioConfig() {
            return SpeechSDK.AudioConfig.fromDefaultMicrophoneInput();
        }

        function getSpeechConfig() {
            var speechConfig = SpeechSDK.SpeechConfig.fromSubscription(key.value, regionOptions.value);
            speechConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_EnableSpeakerDiarization, "true"); // Enable speaker diarization
            console.log("Speaker diarization enabled."); // Log confirmation
            return speechConfig;
        }

        function onRecognized(sender, recognitionEventArgs) {
            var result = recognitionEventArgs.result;
            console.log(result); // Log the entire result for debugging
            phraseDiv.scrollTop = phraseDiv.scrollHeight;
        
            var speakerId = result.speakerId ? ` [Speaker ID: ${result.speakerId}]` : '';
        
            statusDiv.innerHTML += `(recognized) Reason: ${SpeechSDK.ResultReason[result.reason]}`;
            phraseDiv.innerHTML += `${result.text}${speakerId}\r\n`;
        }

        function doContinuousRecognition() {
            var audioConfig = getAudioConfig();
            var speechConfig = getSpeechConfig();
            if (!audioConfig || !speechConfig) return;

            recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);
            recognizer.recognized = onRecognized;

            recognizer.startContinuousRecognitionAsync();
        }

        Initialize(async function (speechSdk) {
            SpeechSDK = speechSdk;
        });
    </script>
</body>

</html>

备注：内容来源于stack exchange，提问作者user29960912