You need to enable JavaScript to run this app.
导航
接口说明-WebSocket
最近更新时间:2024.05.28 11:17:29 | 首次发布时间:2023.10.11 15:02:45

接入必读

请先查看接入必读了解具体接入方式,再参考此文档完成接入。

功能介绍

VoiceConversionStream为用户提供声音转换能力,支持用户输入人声音频,并通过深度学习转化为其他指定的音色,高度保留输入语音的说话风格、情感变化、说话节奏。流式声音转换支持实时的转换效果。

  • 输入:原始说话语音二进制数据
  • 输出:转换音色后的具有目标发音人音色和韵律的语音二进制数据

接口说明

  • 当前支持通过 WebSocket 协议在线调用

  • 请求内容包括:

    • payload字段为将请求参数序列化后的json文本
    • data字段为将音频二进制文件按照base64格式编码(标准base64,RFC 4648)的文本
  • 使用备注:

注意项说明
功能限制说明避免直接拼接json文本,尽量使用转换库,避免造成转义符等导致json格式错误
输入音频格式支持暂只支持s16le
音频编码建议建议采样率16kHz,单通道
输出结果格式支持用户通过请求参数配置,格式暂只支持s16le,建议采样率24kHz,单通道

公共参数

详细说明请参考:功能调用-通用协议-WebSocket

配置参数

payload配置参数为json字符串格式

字段描述类型是否必传默认值
speaker发音人,具体见附录:发音人列表string-
audio_info输入音频信息object-
audio_info.format输入音频编码格式,暂仅支持s16lestring-
audio_info.sample_rate输入音频采样率,大于等于8000, 小于等于48000number-
audio_info.channel输入音频通道数 1/2number-
audio_config输出音频配置信息object-
audio_config.format输出音频编码格式,暂仅支持s16lestring-
audio_config.sample_rate输出音频采样率,大于等于8000, 小于等于48000number-
audio_config.channel输出音频通道数 1/2number-
extra补充参数object-
extra.downstream_align是否要对齐每一帧长度(除了首包和尾包)boolfalse

示例:

{
    "speaker": "zh_female_qingxin_stream",
    "audio_info": {
        "sample_rate": 16000,
        "channel": 1,
        "format": "s16le"
    },
    "audio_config": {
        "sample_rate": 24000,
        "channel": 1,
        "format": "s16le"
    }
}

响应格式

响应中不同消息类型的接收:

  • Text message文本消息类型,包含控制事件和响应数据
  • Binary message二进制消息类型,接收只包含二进制数据的响应。
    文本消息类型响应的定义如下:
字段描述类型
task_id请求任务id,用于链路追踪、问题排查string
message_id请求任务消息id,用于链路追踪、问题排查string
namespace服务接口命名空间,比如VoiceConversionStreamstring
event服务请求任务事件,比如StartTaskstring
status_code状态码number
status_text状态信息string

示例:

{
    "task_id": "***",
    "message_id": "***",
    "namespace": "VoiceConversionStream",
    "event": "TaskFinished",
    "status_code": 20000000,
    "status_text": "OK"
}

参考示例

流式调用方式请参考 WebSocket 公共流式协议:功能调用-通用协议-WebSocket

Golang

// Code sample:
// use websocket client to invoke SAMI Streaming Service
package main

import (
	"bytes"
	"encoding/binary"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"net/http"
	"net/url"
	"os"
	"os/signal"
	"sync"
	"time"

	"github.com/gorilla/websocket"
)

// WebSocketRequest is the control message this sample serializes with
// encoding/json and sends to the service as a WebSocket text frame (see
// voiceConversion).
//
// Only the json tags are used by this program. encoding/json does not know a
// "required" tag option and ignores it; the header/query/form/vd tags are
// presumably meant for a request-binding framework and have no effect here
// (NOTE(review): confirm against the service-side definition).
type WebSocketRequest struct {
	// Token is the caller's auth token; this sample fills it from the
	// package-level token variable.
	Token     string  `header:"SAMI-Token,required" json:"token,required" query:"token,required"`
	// Appkey identifies the calling application.
	Appkey    string  `json:"appkey,required" query:"appkey,required" vd:"$!=''"`
	// Namespace names the service interface, e.g. "VoiceConversionStream".
	Namespace string  `json:"namespace,required" query:"namespace,required" vd:"$!=''"`
	// Version is left empty by this sample and then omitted from the JSON.
	Version   string  `json:"version,omitempty" query:"version"`
	// Event is the task event being sent, e.g. EventStartTask or EventFinishTask.
	Event     string  `json:"event,omitempty" query:"event"`
	// Payload carries the request parameters as a JSON document encoded into
	// a string; nil omits the field.
	Payload   *string `form:"payload" json:"payload,omitempty"`
	// Data carries raw bytes; encoding/json writes a []byte as a base64 string.
	Data      []byte  `form:"data" json:"data,omitempty"`
	// TaskId is left empty by this sample and then omitted from the JSON.
	TaskId    string  `json:"task_id,omitempty" query:"task_id"`
}

// WebSocketResponse is the JSON structure this sample decodes from every
// WebSocket text frame received from the service.
//
// Only the json tags are used by this program; encoding/json ignores the
// unknown "required" option.
type WebSocketResponse struct {
	// TaskId is the request task id, used for tracing and troubleshooting.
	TaskId     string  `form:"task_id,required" json:"task_id,required" query:"task_id,required"`
	// MessageId is the id of this message within the task, used for tracing.
	MessageId  string  `form:"message_id,required" json:"message_id,required" query:"message_id,required"`
	// Namespace is the service interface namespace, e.g. "VoiceConversionStream".
	Namespace  string  `form:"namespace,required" json:"namespace,required" query:"namespace,required"`
	// Event is the task event, e.g. EventTaskStarted or EventTaskFinished.
	Event      string  `form:"event,required" json:"event,required" query:"event,required"`
	// StatusCode is the numeric status code (the documented success example is 20000000).
	StatusCode int32   `form:"status_code,required" json:"status_code,required" query:"status_code,required"`
	// StatusText is the human-readable status, e.g. "OK".
	StatusText string  `form:"status_text,required" json:"status_text,required" query:"status_text,required"`
	// Payload is an optional JSON string; nil when the field is absent.
	Payload    *string `form:"payload,omitempty" json:"payload,omitempty" query:"payload,omitempty"`
	// Data is optional binary content; encoding/json decodes it from a base64 string.
	Data       []byte  `form:"data,omitempty" json:"data,omitempty" query:"data,omitempty"`
}

// Task event names exchanged in the "event" field of the text messages.
const (
	// EventStartTask is sent by this client to open a task.
	EventStartTask    = "StartTask"
	// EventTaskStarted is the reply this client waits for before streaming audio.
	EventTaskStarted  = "TaskStarted"
	// EventFinishTask is sent by this client after the last audio frame.
	EventFinishTask   = "FinishTask"
	// EventTaskFinished is the event on which this client stops reading.
	EventTaskFinished = "TaskFinished"
)

// Package-level configuration and shared state for the sample.
var (
	// websocket domain
	// addr is the host used to build the wss:// URL; override with -addr.
	addr = flag.String("addr", "sami.bytedance.com", "http service address")
	// user auth
	// token and appkey are placeholders that must be replaced with real
	// credentials before running.
	token  = "your_token"
	appkey = "your_appkey"

	// inputAudioPath is the WAV file read and streamed to the service;
	// outputAudioPath is where the converted audio is written as WAV.
	inputAudioPath  = "/path/to/input/audio.wav"
	outputAudioPath = "/path/to/output/audio.wav"

	// speaker is the target voice and namespace the service interface name
	// placed in each control message.
	speaker   = "zh_female_qingxin_stream"
	namespace = "VoiceConversionStream"

	// u is the endpoint URL and c the connection; both are assigned in main
	// and c is shared by every function that talks to the service.
	u url.URL
	c *websocket.Conn

	// interrupt is registered for os.Interrupt in main.
	interrupt chan os.Signal
	// err is a shared scratch error used by main.
	err       error
)

// main parses flags, dials the service's WebSocket endpoint and runs a single
// voice-conversion task with the configured speaker.
//
// The process exits via log.Fatal if the dial fails, and with status 1 if it
// is interrupted (Ctrl-C) while a task is running.
func main() {
	flag.Parse()
	log.SetFlags(0)

	interrupt = make(chan os.Signal, 1)
	signal.Notify(interrupt, os.Interrupt)

	// Build the URL before logging it; logging first printed an empty address.
	u = url.URL{Scheme: "wss", Host: *addr, Path: "/api/v1/ws"}
	log.Printf("connecting to %s\n", u.String())
	start := time.Now()
	requestHeader := http.Header{}
	c, _, err = websocket.DefaultDialer.Dial(u.String(), requestHeader)
	if err != nil {
		log.Fatal("dial:", err)
	}
	defer c.Close()
	fmt.Printf("connection cost time: %dms\n", time.Since(start).Milliseconds())

	// signal.Notify turns off the default "terminate on SIGINT" behaviour, so
	// the channel has to be drained somewhere or Ctrl-C does nothing. The
	// channel is buffered, so an interrupt that arrived during the dial is
	// still picked up here.
	go func() {
		<-interrupt
		log.Println("interrupt received, closing connection")
		// Close may be called concurrently with reads and writes; the error
		// is irrelevant because the process exits immediately afterwards.
		_ = c.Close()
		os.Exit(1)
	}()

	voiceConversion(speaker)
}

func voiceConversion(speaker string) {
	var buf bytes.Buffer
	request := &WebSocketRequest{
		Token:     token,
		Appkey:    appkey,
		Namespace: namespace,
		Event:     EventStartTask,
	}
	inputSampleRate := 16000
	inputChannel := 1
	inputFormat := "s16le"
	outputSampleRate := 24000
	outputChannel := 1
	outputFormat := "s16le"
	testSpeaker := speaker
	voiceConversionPayload := VoiceConversionRequest{
		Speaker: &testSpeaker,
		Info: &AudioInfo{
			SampleRate: inputSampleRate,
			Channel:    inputChannel,
			Format:     inputFormat,
		},
		AudioConfig: &AudioConfig{
			Format:     outputFormat,
			Channel:    outputChannel,
			SampleRate: outputSampleRate,
		},
	}

	b, _ := json.Marshal(&voiceConversionPayload)
	plStr := string(b)
	request.Payload = &plStr
	fmt.Println("req payload: ", *request.Payload)
	controlMsg, _ := json.Marshal(request)

	_ = c.WriteMessage(websocket.TextMessage, controlMsg)
	if err = readTaskStartedEvent(); err != nil {
		return
	}
	testData, err := ioutil.ReadFile(inputAudioPath)
	if err != nil {
		panic(err)
	}
	testData, err = ReadWav2PCM(testData)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(testData))
	for i := 0; i < 0; i++ {
		testData = append(testData, testData...)
	}
	wg := sync.WaitGroup{}
	wg.Add(2)
	start := time.Now()
	// send
	go func() {
		defer wg.Done()
		defer func() {
			request.Event = EventFinishTask
			request.Payload = nil
			controlMsg, _ = json.Marshal(request)
			err := c.WriteMessage(websocket.TextMessage, controlMsg)
			if err != nil {
				log.Println("read err=", err)
				return
			}
			fmt.Println("send end, time: ", time.Since(start))
		}()
		isFirst := true
		l := len(testData)
		step := 3200
		times := int(math.Ceil(float64(l) / float64(step)))
		for i := 0; i < times; i++ {
			var dataToSend []byte
			if (i+1)*step > len(testData) {
				dataToSend = testData[i*step:]
			} else {
				dataToSend = testData[i*step : (i+1)*step]
			}
			if isFirst {
				start = time.Now()
				isFirst = false
			}
			if len(dataToSend) > 0 {
				err := c.WriteMessage(websocket.BinaryMessage, dataToSend)
				if err != nil {
					log.Println("read err=", err)
					return
				}
			}
			fmt.Printf("send len[%v] at %v\n", len(dataToSend), time.Now().Format("2006-01-02T15:04:05.999"))
		}
	}()
	// recv
	go func() {
		defer wg.Done()
		isFirst := true
		for {
			mt, message, err := c.ReadMessage()
			if err != nil {
				log.Println("read err=", err)
				break
			}
			if mt == websocket.BinaryMessage {
				log.Printf("binary, recv: byte[%v] at %v", len(message), time.Now().Format("2006-01-02T15:04:05.999"))
				if isFirst {
					fmt.Println("first resp: ", time.Since(start), time.Now())
					isFirst = false
				}
				buf.Write(message)
			} else {
				if isFirst {
					fmt.Println("first resp: ", time.Since(start), time.Now())
					isFirst = false
				}
				wsResp := WebSocketResponse{}
				err := json.Unmarshal(message, &wsResp)
				if err != nil {
					log.Printf("recv text message, parse failed")
				}
				log.Println(string(message))
				if wsResp.Event == EventTaskFinished {
					log.Printf("recv TaskFinished event: %+v, cost_time=%vms", wsResp, time.Since(start).Milliseconds())
					break
				}
				if wsResp.Payload != nil {
					log.Println(*wsResp.Payload, time.Now())
				}
				buf.Write(wsResp.Data)
			}
		}
	}()
	wg.Wait()
	fmt.Printf("recv len: %v\n", buf.Len())
	wavBinaryData := ConvertPcm2Wav(buf.Bytes(), outputChannel, outputSampleRate, 16)
	if err = ioutil.WriteFile(outputAudioPath, wavBinaryData, 0600); err != nil {
		fmt.Printf("write file %v failed: %v\n", outputAudioPath, err)
	}
	fmt.Println("write file name:", outputAudioPath)
}

// readTaskStartedEvent reads the next message from the shared connection c and
// checks that it is a text frame whose decoded event is EventTaskStarted.
//
// It returns nil on success. Otherwise it logs the reason and returns the
// read or decode error, or a "MessageTypeNotMatch" / "EventTypeNotMatch"
// error when the frame type or event name is not the expected one.
func readTaskStartedEvent() error {
	kind, raw, readErr := c.ReadMessage()
	switch {
	case readErr != nil:
		log.Println("read TaskStarted event failed, ", readErr)
		return readErr
	case kind != websocket.TextMessage:
		log.Println("read TaskStarted event failed, message type not TextMessage: ", kind)
		return fmt.Errorf("MessageTypeNotMatch")
	}

	var resp WebSocketResponse
	if decodeErr := json.Unmarshal(raw, &resp); decodeErr != nil {
		log.Printf("Unmarshal failed, err=%v", decodeErr)
		return decodeErr
	}

	if resp.Event == EventTaskStarted {
		log.Printf("read TaskStarted event: %+v", resp)
		return nil
	}
	log.Printf("read TaskStarted event failed, event type not match: %+v", resp)
	return fmt.Errorf("EventTypeNotMatch")
}

// VoiceConversionRequest is the task configuration that voiceConversion
// marshals to JSON and places, as a string, in WebSocketRequest.Payload.
// Nil fields are omitted from the JSON.
type VoiceConversionRequest struct {
	// Info describes the input audio ("audio_info").
	Info        *AudioInfo   `json:"audio_info,omitempty"`
	// AudioConfig describes the requested output audio ("audio_config").
	AudioConfig *AudioConfig `json:"audio_config,omitempty"`
	// Speaker is the target voice name, e.g. "zh_female_qingxin_stream".
	Speaker     *string      `json:"speaker,omitempty"`
}

// AudioInfo describes the audio sent to the service ("audio_info" in the
// payload). Zero-valued fields are omitted from the JSON.
type AudioInfo struct {
	// SampleRate is the input sample rate in Hz; the service documentation
	// allows 8000 to 48000.
	SampleRate int    `json:"sample_rate,omitempty"`
	// Channel is the number of input channels, 1 or 2.
	Channel    int    `json:"channel,omitempty"`
	// Format is the input encoding; the service documentation lists only "s16le".
	Format     string `json:"format,omitempty"`
}

// AudioConfig describes the audio the service should return ("audio_config"
// in the payload). Zero-valued fields are omitted from the JSON.
type AudioConfig struct {
	// Format is the output encoding; the service documentation lists only "s16le".
	Format     string `json:"format,omitempty"`
	// SampleRate is the output sample rate in Hz.
	SampleRate int    `json:"sample_rate,omitempty"` // default 24000, [8000, 16000, 22050, 24000, 32000, 44100, 48000]
	// Channel is the number of output channels.
	Channel    int    `json:"channel,omitempty"`     // 1, 2
}

// ConvertPcm2Wav wraps raw PCM samples in a canonical 44-byte RIFF/WAVE header
// (format tag 1, PCM) and returns header followed by the samples.
//
// channels, sampleRate and bitsPerSample are written into the header as given;
// byte rate and block alignment are derived from them. A nil inData yields a
// nil result; an empty non-nil slice yields a header-only file.
func ConvertPcm2Wav(inData []byte, channels int, sampleRate int, bitsPerSample int) []byte {
	if inData == nil {
		return nil
	}

	const headerLen = 44
	le := binary.LittleEndian
	pcmLen := len(inData)

	out := make([]byte, headerLen, headerLen+pcmLen)

	// RIFF container: id, size of everything after this field, form type.
	copy(out[0:4], "RIFF")
	le.PutUint32(out[4:8], uint32(pcmLen+36))
	copy(out[8:12], "WAVE")

	// "fmt " chunk: 16-byte PCM format description.
	copy(out[12:16], "fmt ")
	le.PutUint32(out[16:20], uint32(16))
	le.PutUint16(out[20:22], uint16(1)) // format tag: PCM
	le.PutUint16(out[22:24], uint16(channels))
	le.PutUint32(out[24:28], uint32(sampleRate))
	le.PutUint32(out[28:32], uint32(sampleRate*channels*bitsPerSample/8)) // byte rate
	le.PutUint16(out[32:34], uint16(channels*bitsPerSample/8))            // block align
	le.PutUint16(out[34:36], uint16(bitsPerSample))

	// "data" chunk header, followed by the samples themselves.
	copy(out[36:40], "data")
	le.PutUint32(out[40:44], uint32(pcmLen))

	return append(out, inData...)
}

// ReadWav2PCM returns the sample bytes stored in the "data" chunk of a WAV
// file image.
//
// The "data" marker is searched for from byte 36 (the end of a canonical
// 16-byte "fmt " chunk) in 2-byte steps. When the chunk's declared size is
// non-zero and smaller than the bytes that follow, only that many bytes are
// returned, so chunks stored after the audio are not mistaken for samples. A
// declared size of zero or one larger than the remaining bytes is treated as
// unknown and everything after the chunk header is returned.
//
// The returned slice aliases wavData. A nil wavData yields (nil, nil). An
// error is returned when no "data" marker is found or when the file ends
// inside the 8-byte chunk header.
func ReadWav2PCM(wavData []byte) ([]byte, error) {
	if wavData == nil {
		return nil, nil
	}
	const (
		scanStart      = 36 // "RIFF"+size+"WAVE" (12) + "fmt " header (8) + 16-byte body
		chunkHeaderLen = 8  // 4-byte id + 4-byte little-endian size
	)
	for offset := scanStart; offset+4 <= len(wavData); offset += 2 {
		if string(wavData[offset:offset+4]) != "data" {
			continue
		}
		payloadStart := offset + chunkHeaderLen
		if payloadStart > len(wavData) {
			// Slicing here would run past the end of wavData and panic.
			return nil, errors.New("wav data chunk header is truncated")
		}
		pcm := wavData[payloadStart:]
		// Compare as uint64 so a large size cannot wrap on 32-bit platforms.
		declared := uint64(binary.LittleEndian.Uint32(wavData[offset+4 : payloadStart]))
		if declared > 0 && declared < uint64(len(pcm)) {
			pcm = pcm[:declared]
		}
		return pcm, nil
	}
	return nil, fmt.Errorf("pcm data not found in wav data, data length : %v", len(wavData))
}


常见问题

HTTP状态码 | 业务状态码 | 错误信息 | 错误说明 | 解决办法
400 | 40000022 | IllegalPayload:InvalidSpeaker | 发音人设置无效 | 检查发音人是否正确设置

附录

发音人列表

备注:当前仅上线中文发音人,其他语种敬请期待。

中文发音人

场景音色中文名性别调用参数speaker
通用配音清新女声zh_female_qingxin_stream
甜美女声zh_female_tianmei_stream
醇厚男声zh_male_chunhou_stream
磁性解说男声zh_male_commentate_stream
趣味卡通温柔男声(活力男声)zh_male_huoli_stream
童年伙伴(可爱男孩)zh_male_xiaohai_stream
趣味方言四川女声zh_female_sichuan_stream
曲风歌手嘻哈歌手zh_male_rap_stream
超自然音色邻家女孩zh_female_xiaoqian_stream
开朗男声zh_male_xiaojian_stream