将Azure TTS流式输出通过Node.js Socket发送到FreeSWITCH时仅听到无声的问题排查求助
将Azure TTS流式输出通过Node.js Socket发送到FreeSWITCH时仅听到无声的问题排查求助
大家好,我正在开发一个对接FreeSWITCH的AI语音机器人,核心流程是:
- FreeSWITCH通过WebSocket把用户的通话音频传给我的Node.js服务
- 服务用Azure STT完成语音转写,识别用户意图
- 生成回复文本后,用Azure TTS转为RAW PCM音频
- 把PCM按20ms的帧分片,通过WebSocket回传给FreeSWITCH,让用户听到机器人的回复
目前步骤1和2都正常工作,能正确转写用户的语音,但步骤3和4完成后,用户那边只能听到无声,完全没有机器人的语音输出。我已经检查过Azure TTS的配置,也确认生成的PCM数据有内容,但就是无法在通话中播放出来。
以下是我的完整代码,麻烦大家帮忙看看哪里可能出问题了:
import fs from "fs"; import path from "path"; import { WebSocketServer } from "ws"; import sdk from "microsoft-cognitiveservices-speech-sdk"; import pino from "pino"; import "dotenv/config"; /* ============================ CONFIG ============================ */ const WS_PORT = 3000; const RECORD_DIR = "./calls"; const logger = pino({ level: "info" }); if (!fs.existsSync(RECORD_DIR)) { fs.mkdirSync(RECORD_DIR, { recursive: true }); } /* ============================ AZURE BASE CONFIG ============================ */ const baseSpeechConfig = sdk.SpeechConfig.fromSubscription( process.env.AZURE_SPEECH_KEY, process.env.AZURE_SPEECH_REGION ); baseSpeechConfig.speechRecognitionLanguage = "en-IN"; /* ============================ WEBSOCKET SERVER ============================ */ const wss = new WebSocketServer({ port: WS_PORT, host: "127.0.0.1", perMessageDeflate: false }); logger.info(` audio_fork WS listening on 127.0.0.1:${WS_PORT}`); /* ============================ TTS → RAW PCM ============================ */ function synthesizeToRawPCM(text, sampleRate) { return new Promise((resolve, reject) => { const chunks = []; const speechConfig = sdk.SpeechConfig.fromSubscription( process.env.AZURE_SPEECH_KEY, process.env.AZURE_SPEECH_REGION ); speechConfig.speechSynthesisOutputFormat = sampleRate === 16000 ? sdk.SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm : sdk.SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm; const pullStream = sdk.AudioOutputStream.createPullStream(); const audioConfig = sdk.AudioConfig.fromStreamOutput(pullStream); const synth = new sdk.SpeechSynthesizer(speechConfig, audioConfig); synth.speakTextAsync( () => { let chunk; while ((chunk = pullStream.read()) !== undefined) { if (chunk?.byteLength) { chunks.push(Buffer.from(chunk)); } } synth.close(); resolve(Buffer.concat(chunks)); }, err => { synth.close(); reject(err); } ); }); } /* ============================ SEND AUDIO → FREESWITCH (20ms frames, realtime) ============================ */ async function sendAudio(ws, pcm, sampleRate) { if (ws.readyState !== ws.OPEN) return; const FRAME_BYTES = sampleRate === 16000 ? 640 : 320; for (let i = 0; i < pcm.length; i += FRAME_BYTES) { const frame = pcm.slice(i, i + FRAME_BYTES); ws.send(JSON.stringify({ type: "playAudio", data: { audioContentType: "raw", sampleRate, audioContent: frame.toString("base64") } })); await new Promise(r => setTimeout(r, 20)); } } /* ============================ CONNECTION HANDLER ============================ */ wss.on("connection", ws => { logger.info(" audio_fork connected"); const callId = Date.now(); let sampleRate = null; let speaking = false; let pushStream; let recognizer; const rawFile = path.join(RECORD_DIR, `call-${callId}.raw`); const transcriptFile = path.join(RECORD_DIR, `call-${callId}.txt`); const audioRecorder = fs.createWriteStream(rawFile, { flags: "a" }); const transcriptWriter = fs.createWriteStream(transcriptFile, { flags: "a" }); function initSTT(rate) { logger.info(` Initializing Azure STT @ ${rate} Hz`); const format = sdk.AudioStreamFormat.getWaveFormatPCM(rate, 16, 1); pushStream = sdk.AudioInputStream.createPushStream(format); const audioCfg = sdk.AudioConfig.fromStreamInput(pushStream); recognizer = new sdk.SpeechRecognizer(baseSpeechConfig, audioCfg); recognizer.recognized = async (_, e) => { if (e.result.reason !== sdk.ResultReason.RecognizedSpeech) return; if (speaking) return; const text = e.result.text?.trim(); if (!text) return; transcriptWriter.write(`USER: ${text}\n`); logger.info(` ${text}`); speaking = true; const reply = text.toLowerCase().includes("support") ? "Sure. Please describe your issue." : "Sorry, can you repeat that?"; transcriptWriter.write(`BOT: ${reply}\n`); try { const pcm = await synthesizeToRawPCM(reply, sampleRate); await sendAudio(ws, pcm, sampleRate); } catch (err) { logger.error(err, "TTS failed"); } speaking = false; }; recognizer.startContinuousRecognitionAsync(); } ws.on("message", (data, isBinary) => { if (!isBinary) { try { const meta = JSON.parse(data.toString()); if (meta?.sampling_rate) { if (meta.sampling_rate === "16k" || meta.sampling_rate === "16000" || meta.sampling_rate === "16") { sampleRate = 16000; } else if (meta.sampling_rate === "8k" || meta.sampling_rate === "8000" || meta.sampling_rate === "8") { sampleRate = 8000; } logger.info(` FreeSWITCH sampleRate=${sampleRate}`); if (!recognizer && sampleRate) { initSTT(sampleRate); } } } catch { } return; } audioRecorder.write(data); if (!speaking && pushStream) { pushStream.write(data); } }); ws.on("close", () => { logger.info(`❌ Call ${callId} ended`); audioRecorder.end(); transcriptWriter.end(); pushStream?.close(); recognizer?.stopContinuousRecognitionAsync(); }); }); /* ============================ SHUTDOWN ============================ */ process.on("SIGINT", () => { logger.info("Shutting down WS server"); wss.close(); process.exit(0); });
我自己怀疑过几个点,但还没找到问题:
- 音频格式匹配问题:我用的是和FreeSWITCH传来的采样率一致的RAW PCM(16位单声道),Azure TTS的输出格式也对应设置了,但会不会还有其他参数不匹配?
- 帧大小计算问题:16000Hz下20ms的帧应该是
16000*0.02*2=640字节,8000Hz下是8000*0.02*2=320字节,这个计算是对的吗? - WebSocket消息格式问题:我用的是
type: "playAudio"的JSON格式,FreeSWITCH的WebSocket音频播放是不是需要特定的消息格式? - PCM数据的字节序问题:Azure TTS输出的是小端还是大端?会不会和FreeSWITCH要求的不一致?
麻烦有经验的朋友帮忙看看,或者给我一些排查的方向,谢谢大家!




