You need to enable JavaScript to run this app.
导航

WebSocket API

最近更新时间2023.05.19 16:19:43

首次发布时间2021.12.14 20:44:27

接入必读

请先查看接入必读了解具体接入方式,再参考此文档完成接入。

接口说明

  • 当前支持通过 WebSocket 协议在线调用
  • 请求内容包括 payload 字段,其值为请求参数序列化后的 JSON 文本

公共参数

详细说明请参考「功能调用-通用协议-WebSocket」

配置参数

payload配置参数为json字符串格式

| 字段 | 描述 | 类型 | 是否必传 | 默认值 |
| --- | --- | --- | --- | --- |
| text | 输入文本 | string | 否。text与ssml字段至少一个非空,若二者都非空则按照ssml字段处理 | - |
| ssml | 输入文本(SSML格式),与text字段至少一个非空 | string | 否。text与ssml字段至少一个非空,若二者都非空则按照ssml字段处理 | - |
| speaker | 发音人,具体见附录:发音人列表 | string | | - |
| audio_config | 补充参数 | object | | |
| audio_config.format | 输出音频编码格式,wav/mp3/aac | string | | mp3 |
| audio_config.sample_rate | 输出音频采样率,可选值 [8000,16000,22050,24000,32000,44100,48000] | number | | 24000 |
| audio_config.speech_rate | 语速,取值范围[-50,100],100代表2.0倍速,-50代表0.5倍速 | number | | 0 |
| audio_config.pitch_rate | 音调,取值范围[-12,12] | number | | 0 |
| audio_config.enable_timestamp | 是否选择同时返回字与音素时间戳 | bool | | false |

示例:

{
    "text": "欢迎使用文本转语音服务。",
    "speaker": "zh_female_qingxin",
    "audio_config":
    {
        "format": "wav",
        "sample_rate": 16000
    }
}

响应格式

响应中不同消息类型的接收:

  • Text message:文本消息类型,包含控制事件和响应数据。
  • Binary message:二进制消息类型,响应中只包含二进制数据。当 enable_timestamp=false 时,服务端返回二进制消息类型。

文本消息类型响应的定义如下:

| 字段 | 描述 | 类型 |
| --- | --- | --- |
| task_id | 请求任务id,用于链路追踪、问题排查 | string |
| message_id | 请求任务消息id,用于链路追踪、问题排查 | string |
| namespace | 服务接口命名空间,比如TTS | string |
| event | 服务请求任务事件,比如StartTask | string |
| data | 请求响应二进制数据,标准base64编码 | string |
| payload | 请求响应文本信息,json字符串格式 | string |
| status_code | 状态码 | number |
| status_text | 状态信息 | string |
  • 响应结果payload为json字符串格式,json内容格式如下:

    | 字段 | 描述 | 类型 |
    | --- | --- | --- |
    | duration | 音频时长,单位秒 | number |
    | words | 字的时间戳,单位秒。需要请求参数audio_config.enable_timestamp=true | array |
    | words.word | 字内容 | string |
    | words.start_time | 当前字开始时间 | number |
    | words.end_time | 当前字结束时间 | number |
    | phonemes | 音素的时间戳,单位秒。需要请求参数audio_config.enable_timestamp=true | array |
    | phonemes.phone | 音素内容 | string |
    | phonemes.start_time | 当前音素开始时间 | number |
    | phonemes.end_time | 当前音素结束时间 | number |

    payload示例:

    {
        "duration": 3.0,
        "words":
        [
            {
                "word": "你",
                "start_time": "0",
                "end_time": "0.05"
            },
            ...
        ],
        "phonemes":
        [
            {
                "phone": "C0n",
                "start_time": "0",
                "end_time": "0.025"
            },
            ...
        ]
    }
    

参考示例

流式调用方式参考公共WebSocket流式协议

Golang

// Code sample:
// use websocket client to invoke SAMI Streaming Service

package main

import (
	"bytes"
	"encoding/json"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/url"
	"os"
	"os/signal"
	"time"

	"github.com/gorilla/websocket"
)

// WebSocketRequest is the control message the client sends to the service.
// This sample marshals it with encoding/json and writes it as a text frame.
//
// Fields:
//   - Token, Appkey: user credentials sent with every control message.
//   - Namespace: service namespace; this sample sends "TTS".
//   - Version: not set anywhere in this sample.
//   - Event: task event name, e.g. EventStartTask or EventFinishTask.
//   - Payload: request parameters serialized as a JSON string.
//   - Data: raw bytes; encoding/json emits a []byte as a base64 string.
//   - TaskId: caller-chosen task identifier.
//
// NOTE(review): the header/query/form/vd tags and the "required" option are not
// interpreted by encoding/json; presumably they target a request-binding
// framework used elsewhere — confirm before relying on them.
type WebSocketRequest struct {
	Token     string  `header:"SAMI-Token,required" json:"token,required" query:"token,required"`
	Appkey    string  `json:"appkey,required" query:"appkey,required" vd:"$!=''"`
	Namespace string  `json:"namespace,required" query:"namespace,required" vd:"$!=''"`
	Version   string  `json:"version,omitempty" query:"version"`
	Event     string  `json:"event,omitempty" query:"event"`
	Payload   *string `form:"payload" json:"payload,omitempty"`
	Data      []byte  `form:"data" json:"data,omitempty"`
	TaskId    string  `json:"task_id,omitempty" query:"task_id"`
}
// WebSocketResponse is the JSON body of a text message received from the
// service; this sample decodes it with encoding/json.
//
// Fields:
//   - TaskId, MessageId: identifiers of the task and of this message.
//   - Namespace: service namespace of the response.
//   - Event: task event name; the sample checks for EventTaskStarted and
//     EventTaskFinished.
//   - StatusCode, StatusText: status code and status text of the response.
//   - Payload: response text as a JSON string; nil when the field is absent.
//   - Data: response bytes; encoding/json decodes the base64 string in the
//     message into this []byte.
//
// NOTE(review): the form/query tags and the "required" option are not
// interpreted by encoding/json; presumably they target a binding framework —
// confirm.
type WebSocketResponse struct {
	TaskId     string  `form:"task_id,required" json:"task_id,required" query:"task_id,required"`
	MessageId  string  `form:"message_id,required" json:"message_id,required" query:"message_id,required"`
	Namespace  string  `form:"namespace,required" json:"namespace,required" query:"namespace,required"`
	Event      string  `form:"event,required" json:"event,required" query:"event,required"`
	StatusCode int32   `form:"status_code,required" json:"status_code,required" query:"status_code,required"`
	StatusText string  `form:"status_text,required" json:"status_text,required" query:"status_text,required"`
	Payload    *string `form:"payload,omitempty" json:"payload,omitempty" query:"payload,omitempty"`
	Data       []byte  `form:"data,omitempty" json:"data,omitempty" query:"data,omitempty"`
}

// Task event names exchanged with the service.
//
// In this sample the client sends EventStartTask and then EventFinishTask,
// expects EventTaskStarted as the reply to the start message, and stops
// receiving when EventTaskFinished arrives.
const (
	EventStartTask    = "StartTask"
	EventTaskStarted  = "TaskStarted"
	EventFinishTask   = "FinishTask"
	EventTaskFinished = "TaskFinished"
)

// Package-level state shared by main and the helper functions of this sample.
var (
	// websocket domain; main uses it as the host of the wss URL
	addr = flag.String("addr", "sami.bytedance.com", "http service address")
	// user auth; copied into every control message
	token  = "your_token"
	appkey = "your_appkey"

	// u is the service endpoint and c the connection dialed from it; both are
	// set in main and c is used by every function that reads or writes.
	u url.URL
	c *websocket.Conn

	// interrupt receives os.Interrupt, done is closed by the receive goroutine
	// when it exits, and err is the error variable shared by main and
	// streamingTTSTest.
	interrupt chan os.Signal
	done      chan struct{}
	err       error

	// outputFile is where the received audio is written. inputFile is not
	// referenced in the visible code.
	inputFile  = "input.pcm"
	outputFile = "output.wav"
)

func main() {
	flag.Parse()
	log.SetFlags(0)

	interrupt = make(chan os.Signal, 1)
	signal.Notify(interrupt, os.Interrupt)
	done = make(chan struct{})

	u = url.URL{Scheme: "wss", Host: *addr, Path: "/api/v1/ws"}
	log.Printf("connecting to %s", u.String())

	c, _, err = websocket.DefaultDialer.Dial(u.String(), nil)
	if err != nil {
		log.Fatal("dial:", err)
	}
	defer c.Close()

	streamingTTSTest("zh_female_qingxin", "Hello, 欢迎使用文本转语音服务", "wav", 16000, true)
	// streamingTTSTest("zh_female_qingxin", "Hello, 欢迎使用文本转语音服务", "wav", 16000, false)
}

// readTaskStartedEvent reads the next message from the shared connection and
// verifies that it is a text message carrying the TaskStarted event.
//
// It returns the read or JSON-decoding error as is, "MessageTypeNotMatch" when
// the message is not a text message, and "EventTypeNotMatch" when the decoded
// event is anything other than EventTaskStarted.
func readTaskStartedEvent() error {
	kind, raw, readErr := c.ReadMessage()
	if readErr != nil {
		log.Println("read TaskStarted event failed, ", readErr)
		return readErr
	}
	if kind != websocket.TextMessage {
		log.Println("read TaskStarted event failed, message type not TextMessage: ", kind)
		return fmt.Errorf("MessageTypeNotMatch")
	}

	var resp WebSocketResponse
	if decodeErr := json.Unmarshal(raw, &resp); decodeErr != nil {
		return decodeErr
	}

	if resp.Event == EventTaskStarted {
		log.Printf("read TaskStarted event: %+v", resp)
		return nil
	}
	log.Printf("read TaskStarted event failed, event type not match: %+v", resp)
	return fmt.Errorf("EventTypeNotMatch")
}

func streamingTTSTest(speaker, text, format string, sampleRate int, enableTimestamp bool) {
	var buf bytes.Buffer
	defer func() {
		log.Println("save bytes into file:", buf.Len())
		if buf.Len() > 0 {
			_ = ioutil.WriteFile(outputFile, buf.Bytes(), 0644)
		}
	}()

	// send control message
	payloadStr := fmt.Sprintf(
		`{"audio_config":{"format":"%v","speech_rate":0, "sample_rate":%v, "enable_timestamp":%t},"speaker":"%v","text":"%v"}`,
		format, sampleRate, enableTimestamp, speaker, text,
	)
	controlReq := &WebSocketRequest{
		Token:     token,
		TaskId:    "test_mock",
		Appkey:    appkey,
		Namespace: "TTS",
		Event:     EventStartTask,
		Payload:   &payloadStr,
	}
	controlMsg, _ := json.Marshal(controlReq)
	err = c.WriteMessage(websocket.TextMessage, controlMsg)
	if err != nil {
		log.Println("write:", err)
		return
	}
	if err = readTaskStartedEvent(); err != nil {
		log.Println("read failed, ", err)
		return
	}
	controlReq.Event = EventFinishTask
	controlMsg, _ = json.Marshal(controlReq)
	err = c.WriteMessage(websocket.TextMessage, controlMsg)
	if err != nil {
		log.Println("write:", err)
		return
	}

	go func() {
		defer close(done)
		isFirst := true
		startTime := time.Now()
		for {
			mt, message, err := c.ReadMessage()
			if err != nil {
				log.Println("read:", err)
				return
			}
			if isFirst {
				startTime = time.Now()
				isFirst = false
			}
			if mt == websocket.BinaryMessage {
				log.Printf("recv: byte[%v]", len(message))
				buf.Write(message)
			} else {
				wsResp := WebSocketResponse{}
				ttsPayload := ""
				err := json.Unmarshal(message, &wsResp)
				if err != nil {
					log.Printf("recv text message, parse failed")
				}
				if wsResp.Event == EventTaskFinished {
					log.Printf(
						"recv TaskFinished event: %+v, cost_time=%v", wsResp, time.Since(startTime).Milliseconds(),
					)
					return
				}
				if wsResp.Payload != nil {
					ttsPayload = *wsResp.Payload
				}
				buf.Write(wsResp.Data)
				log.Printf("recv: data=byte[%d], payload=%v", len(wsResp.Data), ttsPayload)
			}
		}
	}()

	for {
		select {
		case <-done:
			return
		case <-interrupt:
			log.Println("interrupt")

			// Cleanly close the connection by sending a close message and then
			// waiting (with timeout) for the server to close the connection.
			err := c.WriteMessage(
				websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, ""),
			)
			if err != nil {
				log.Println("write close:", err)
				return
			}
			select {
			case <-done:
			case <-time.After(5 * time.Second):
			}
			return
		}
	}
}

Python

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# author:bytedance

import asyncio
import base64
import json
import time
import uuid

import websockets

# Path where the received audio is saved.
result_path = "./output.wav"
# TTS request parameters; tts_ws() serializes this dict with json.dumps and
# sends it as the "payload" field of the request.
payload = {
    "text": "Hello, 欢迎使用文本转语音服务",
    "speaker": "zh_female_qingxin",
    "audio_config": {
        "format": "wav",
        "speech_rate": 0,
        "enable_timestamp": False,
        "sample_rate": 16000
    }}


async def tts_ws():
    """Run one streaming TTS task over WebSocket and save the audio to ``result_path``.

    Sends a ``StartTask`` event carrying the JSON-serialized module-level
    ``payload``, follows it immediately with ``FinishTask``, then reads
    responses until a ``TaskFinished`` event, a non-zero ``status_code`` or an
    error while handling a message. Audio arrives either as binary frames or
    as base64 text in the ``data`` field of text frames; both are appended to
    the output file. Each received message is printed, and at the end the time
    to the first audio packet (measured from just before connecting) is
    printed.
    """
    # wss:// (as in the Go sample) so token and appkey are not sent in clear text.
    api_url = "wss://sami.bytedance.com/api/v1/ws"
    req = {
        "token": "your_token",
        "appkey": "your_appkey",
        "namespace": "TTS",
        "event": "StartTask",
        "payload": json.dumps(payload),
        # "task_id": str(uuid.uuid4()),
    }
    st = time.perf_counter()
    first_package_time = None
    # `with` closes the output file even if connecting or receiving raises.
    with open(result_path, "wb+") as result_data:
        async with websockets.connect(api_url, ping_interval=None) as ws:
            # Send the start event first.
            await ws.send(json.dumps(req))
            # Then tell the service that the request is complete.
            req["event"] = "FinishTask"
            await ws.send(json.dumps(req))
            while True:
                res = await ws.recv()
                try:
                    if isinstance(res, str):
                        print("receive text message, ", end="")
                        res_dict = json.loads(res)
                        # .get() also covers a present-but-null "data" field.
                        audio_b64 = res_dict.get("data")
                        if audio_b64:
                            if first_package_time is None:
                                first_package_time = time.perf_counter() - st
                            audio = base64.b64decode(audio_b64)
                            result_data.write(audio)
                            # Report the decoded byte count, not the base64 length.
                            print(" data=byte[%d]" % len(audio), end="")
                        if "payload" in res_dict:
                            print(" payload=%s" % res_dict["payload"], end="")
                        print(" task_id=%s, event=%s status_code=%d status_text=%s" % (
                            res_dict["task_id"], res_dict["event"], res_dict["status_code"], res_dict["status_text"]))
                        if res_dict["status_code"] != 0:
                            print("task failed: ", res_dict)
                            await ws.close()
                            break
                        if res_dict["event"] == "TaskFinished":
                            await ws.close()
                            break
                    else:
                        print("receive binary message, len=%d" % len(res))
                        result_data.write(res)
                        if first_package_time is None:
                            first_package_time = time.perf_counter() - st
                except Exception as e:
                    # Best effort: report the problem and stop receiving.
                    print("exception", e)
                    break
            if first_package_time is not None:
                print("首包时间:", first_package_time)


if __name__ == '__main__':
    # asyncio.run creates the event loop, runs the coroutine to completion,
    # closes the loop, and re-raises any exception from tts_ws() instead of
    # leaving it unretrieved on a finished task.
    asyncio.run(tts_ws())

常见问题

| HTTP状态码 | 业务状态码 | 错误信息 | 错误说明 | 解决办法 |
| --- | --- | --- | --- | --- |
| 400 | 40402004 | TTSInvalidSpeaker | TTS发音人设置无效 | 检查TTS发音人是否正确设置 |
| 400 | 40402001 | TTSEmptyText | TTS未设置文本 | 检查TTS文本是否设置 |
| 400 | 40402002 | TTSInvalidText | TTS设置文本非法 | 检查TTS文本与发音人是否匹配、文本是否包含可读内容 |
| 400 | 40402003 | TTSExceededTextLimit | TTS文本长度超限 | 检查TTS文本是否超限。非流式接口上限为 1000 个utf-8字符;流式接口上限为 2000 个utf-8字符(包括空格、标点、汉字、字母等) |