You need to enable JavaScript to run this app.
导航

实时语音翻译API

最近更新时间2022.09.14 16:28:06

首次发布时间2022.07.18 19:59:55

接口描述

实时语音翻译API集成语音识别、智能断句、文本翻译等技术能力,可以将实时语音流、音频文件识别成文字并翻译成目标语言,达到“边说边译”的效果。

接口限制

  • 支持语向:可在「语言支持」列表中查看实时语音翻译支持的源语言及目标语言。

  • 音频参数要求

    • 采样率:16000hz

    • 采样位:16

    • 单声道

    • 格式:wav或pcm(格式错误会导致识别效果差,返回时间戳错误等一系列问题,所以一定要确保自己发送的格式正确。)

    • 音频内容需使用base64编码

接口说明

请求地址

wss://translate.volces.com/api/translate/speech/v1/

请求方法

WEBSOCKET

服务鉴权

使用火山引擎鉴权sdk,详情参照公共参数

参数名
Version2020-06-01
ActionSpeechTranslate
Path/api/translate/speech/v1/

请求体

服务使用websocket协议

  • Configuration建立websocket之后第一包发送配置包如下
字段类型是否为必须项说明备注
SourceLanguageString源语言仅支持zh ja en
TargetLanguages[String]目标语言可在语言支持中查询对应的语言代码
HotWordList[Object]热词在语音识别时,接口更倾向于识别成热词的内容
{
    "Configuration": {
        "SourceLanguage": "zh",
        "TargetLanguages": [
            "en"
        ],
        "HotWordList": [
            {
                "Word": "hello",
                "Scale": 1
            }
        ]
    }
}
  • AudioData发送Configuration之后可以发送二进制数组的音频数据包,采样率为16000hz,单声道,使用base64编码,推荐每100-200ms发送一包
字段类型是否为必须项说明备注
AudioDataString音频包的base64编码
{
    "AudioData": "YQ=="
}
  • End bool值,只要发送了就表示音频发送结束
字段类型是否为必须项说明备注
EndBool音频包发送完成时发送;不论取值为true还是false,server接收到该包后都会处理完剩余的音频包以及相关的翻译,全部结束后会断开websocket
{
    "End": true
}

返回体

字段类型说明备注
ResponseMetadataResponseMetadata通用字段
SubtitleObject包含翻译结果、时间、语言等信息的结果

Subtitle:

字段类型说明备注
TextString返回的结果当某两个结果的Sequence和Definite一致时,结果中的Text分别代表原文及其译文
BeginTimeInt该文本识别的开始时间
EndTimeInt该文本识别的结束时间
DefiniteBool文本是否确定
  • true:语音已结束
  • false:语音进行中
LanguageString语言
SequenceInt序列号文本片段的序号
{
    "Subtitle": {
        "Text": "i'm elise hugh this is ted talk daily",//返回的结果
        "BeginTime": 4070,//该文本识别的开始时间
        "EndTime": 6955,//该文本的识别的结束时间
        "Definite": false,//正在说话,文本还未确定
        "Language": "en",//语言
        "Sequence": 8 //序列号 sequence和definite一致的就是一个语言的不同文本
    },
    "ResponseMetaData": {
        "RequestId": "7088303501725501476",//每次连接有不同的reqId
        "Action": "SpeechTranslate",
        "Version": "2020-06-01",
        "Service": "translate",
        "Region": "cn-north-1"
    }
}

在接口错误时,错误信息会在ResponseMetadata.Error中,包含了错误类型Code:: String和错误信息Message:: String两个字段,其中错误码列表为:

错误码说明
-301音频间隔过长
-400请求参数错误,具体错误可参考Message信息
-401未授权用户
-403无权限
-429请求过于频繁
-5xx翻译引擎内部错误
100001-100021通用错误

示例代码

使用Python

依赖 volc-sdk-python
依赖 websocket-client

# 需要引入https://github.com/volcengine/volc-sdk-python
import base64
import json
import threading
import time

import websocket
from volcengine.ApiInfo import ApiInfo
from volcengine.Credentials import Credentials
from volcengine.ServiceInfo import ServiceInfo
from volcengine.base.Service import Service

# Credentials are issued at https://console.volcengine.com/iam/keymanage/
k_access_key = "k_access_key"
k_secret_key = "k_secret_key"
k_host = 'translate.volces.com'
k_path = '/api/translate/speech/v1/'
# Seconds. Passed as both trailing ServiceInfo arguments (presumably the
# connection and socket timeouts of the SDK -- confirm against volc-sdk-python).
k_timeout = 5
k_service_info = ServiceInfo(
    k_host,
    {'Content-Type': 'application/json'},
    Credentials(k_access_key, k_secret_key, 'translate', 'cn-north-1'),
    k_timeout,
    k_timeout,
)
# Query parameters that identify the API action and its version.
k_query = {
    'Action': 'SpeechTranslate',
    'Version': '2020-06-01',
}
# API table handed to the SDK Service; keyed by action name.
k_api_info = {
    'SpeechTranslate': ApiInfo('GET', k_path, k_query, {}, {}),
}

file_path = './audio.wav'  # path of the audio file to stream


def get_websocket_url():
    """Return the signed ``wss://`` URL used to open the translation socket.

    A ``Service`` is built from the module-level service and API tables and
    asked for the signed query string of the ``SpeechTranslate`` action; that
    string is appended to the host and path after a ``?``.
    """
    signed_query = Service(k_service_info, k_api_info).get_sign_url("SpeechTranslate", {})
    return ''.join(['wss://', k_host, k_path, '?', signed_query])


def send_audio(ws, path=None, chunk_size=200 * 32, interval=0.2):
    """Stream an audio file over *ws* in paced chunks, then signal the end.

    Each chunk is base64-encoded and sent as ``{"AudioData": "..."}``; once
    the file is exhausted a final ``{"End": true}`` message is sent.

    Args:
        ws: Object with a ``send(str)`` method (the open websocket).
        path: Audio file to read. Defaults to the module-level ``file_path``.
        chunk_size: Bytes read per packet. The default of 200 * 32 bytes is
            200 ms of 16 kHz, 16-bit, mono audio (32 bytes per millisecond).
        interval: Seconds to wait after each packet so the file is sent at
            roughly real-time speed.
    """
    if path is None:
        path = file_path
    # The context manager guarantees the file is closed even if a send fails.
    with open(path, 'rb') as audio_file:
        while True:
            chunk = audio_file.read(chunk_size)
            if not chunk:
                break
            packet = {"AudioData": base64.b64encode(chunk).decode('utf-8')}
            ws.send(json.dumps(packet))
            time.sleep(interval)
    ws.send(json.dumps({"End": True}))


def recv(ws):
    """Print every message received on *ws* until the connection ends.

    Messages may arrive as ``bytes`` (binary frames) or ``str`` (text
    frames); bytes are decoded as UTF-8 before printing.

    Args:
        ws: Object with a ``recv()`` method (the open websocket).
    """
    while True:
        message = ws.recv()
        if not message:
            # NOTE(review): websocket-client's recv() is expected to return an
            # empty payload when the peer closes the connection -- confirm
            # against the installed library version.
            break
        if isinstance(message, bytes):
            message = message.decode("utf-8")
        print(message)


if __name__ == '__main__':
    websocket.enableTrace(False)
    ws = websocket.WebSocket()
    ws.connect(get_websocket_url(),
               header=["Content-Type: application/json"])
    try:
        # The first message on the socket must be the configuration packet.
        configuration = {
            "Configuration": {
                "SourceLanguage": "zh",
                "TargetLanguages": [
                    "en"
                ],
                "HotWordList": [
                    {
                        "Word": "hello",
                        "Scale": 1
                    }
                ]
            }
        }
        ws.send(json.dumps(configuration))
        # Receive and send concurrently so results are printed while audio
        # is still being streamed.
        receiver = threading.Thread(target=recv, args=(ws,))
        sender = threading.Thread(target=send_audio, args=(ws,))
        receiver.start()
        sender.start()
        sender.join()
        receiver.join()
    finally:
        # Release the socket whether the run finished or was interrupted.
        ws.close()

使用Golang

依赖 volc-sdk-golang

package main

import (
        "bytes"
        "encoding/json"
        "io"
        "io/ioutil"
        "net/http"
        "net/url"
        "os"
        "sync"
        "time"

        "github.com/gorilla/websocket"
        "github.com/volcengine/volc-sdk-golang/base"
)

// API credentials; replace the placeholder values with a real key pair.
const (
        accessKey = "accessKey" // https://console.volcengine.com/iam/keymanage/
        secretKey = "secretKey"
)

// Version and action name sent as query parameters, and the host that
// serves the speech translation API.
const (
        kServiceVersion20200601 = "2020-06-01"
        kSpeechTranslateAction  = "SpeechTranslate"
        kHost                   = "translate.volces.com"
)

func fromLocal(path string) (*bytes.Reader, error) {
        f, err := os.Open(path)
        if err != nil {
                return nil, err
        }
        defer f.Close()
        data, err := ioutil.ReadAll(f)
        if err != nil {
                return nil, err
        }
        return bytes.NewReader(data), nil
}

// newServiceInfo describes the translate service on the given host: a
// 5-second timeout, empty headers, and credentials scoped to the
// "translate" service in the cn-north-1 region (keys are set later).
func newServiceInfo(host string) *base.ServiceInfo {
        info := new(base.ServiceInfo)
        info.Timeout = 5 * time.Second
        info.Host = host
        info.Header = http.Header{}
        info.Credentials = base.Credentials{
                Service: "translate",
                Region:  base.RegionCnNorth1,
        }
        return info
}

// newClient builds an SDK client for host that knows the single
// SpeechTranslate API (GET on the speech path with Action/Version query
// parameters) and carries the given access/secret key pair.
func newClient(host string, accessKey string, secretKey string) *base.Client {
        query := url.Values{}
        query.Set("Action", kSpeechTranslateAction)
        query.Set("Version", kServiceVersion20200601)

        apis := map[string]*base.ApiInfo{
                kSpeechTranslateAction: &base.ApiInfo{
                        Method: http.MethodGet,
                        Path:   "/api/translate/speech/v1/",
                        Query:  query,
                },
        }

        c := base.NewClient(newServiceInfo(host), apis)
        c.SetAccessKey(accessKey)
        c.SetSecretKey(secretKey)
        return c
}

// newAudioData wraps raw audio bytes in the JSON packet
// {"AudioData": "<base64>"}; encoding/json base64-encodes []byte fields.
// If marshalling fails the error is printed and a single zero byte is
// returned.
func newAudioData(data []byte) []byte {
        packet := struct {
                AudioData []byte `json:"AudioData"`
        }{AudioData: data}

        encoded, err := json.Marshal(&packet)
        if err == nil {
                return encoded
        }
        println(err.Error())
        return []byte{0}
}

// getWsSignUrl returns the wss:// URL for the SpeechTranslate API on host,
// using the signed query string produced by the SDK client as the URL's
// raw query. A signing error is returned unchanged with an empty URL.
func getWsSignUrl(host string, accessKey string, secretKey string) (string, error) {
        client := newClient(host, accessKey, secretKey)

        signedQuery, err := client.GetSignUrl(kSpeechTranslateAction, nil)
        if err != nil {
                return "", err
        }

        var wsUrl url.URL
        wsUrl.Scheme = "wss"
        wsUrl.Host = client.ServiceInfo.Host
        wsUrl.Path = client.ApiInfoList[kSpeechTranslateAction].Path
        wsUrl.RawQuery = signedQuery
        return wsUrl.String(), nil
}

// main streams audio.wav to the speech translation websocket and prints
// every message the server sends back.
//
// One goroutine reads server messages until the connection ends; another
// sends the audio in 200*32-byte packets (200 ms of 16 kHz, 16-bit, mono
// audio) every 200 ms and finishes with an End packet.
func main() {
        var (
                group  = &sync.WaitGroup{}
                buffer = make([]byte, 200*32)
                dialer = &websocket.Dialer{}
                header = http.Header{}
        )
        r, err := fromLocal("audio.wav")
        if err != nil {
                panic(err)
        }
        signUrl, err := getWsSignUrl(kHost, accessKey, secretKey)
        if err != nil {
                panic(err)
        }
        header.Set("content-type", "application/json")
        conn, resp, err := dialer.Dial(signUrl, header)
        if err != nil {
                // resp is nil when the failure happened before any HTTP
                // response was received, so guard the dereference.
                if resp != nil {
                        println(resp.Status)
                }
                panic(err)
        }
        defer conn.Close()

        // The first message on the socket must be the configuration packet.
        if err = conn.WriteMessage(websocket.BinaryMessage, []byte(`{"Configuration":{"SourceLanguage":"en","TargetLanguages":["zh"]}}`)); err != nil {
                panic(err)
        }

        group.Add(2)
        go func() {
                defer group.Done()
                for {
                        _, data, readErr := conn.ReadMessage()
                        if readErr != nil {
                                println(readErr.Error())
                                return
                        }
                        println(string(data))
                }
        }()
        go func() {
                defer group.Done()
                for {
                        // Goroutine-local error variables avoid racing on the
                        // outer err shared with main.
                        n, readErr := r.Read(buffer)
                        if n > 0 {
                                // Send only the bytes actually read; the tail of
                                // buffer still holds data from the previous packet.
                                if sendErr := conn.WriteMessage(websocket.BinaryMessage, newAudioData(buffer[:n])); sendErr != nil {
                                        panic(sendErr)
                                }
                                time.Sleep(200 * time.Millisecond)
                        }
                        if readErr != nil {
                                println(readErr.Error())
                                if readErr == io.EOF {
                                        if sendErr := conn.WriteMessage(websocket.BinaryMessage, []byte(`{"End":true}`)); sendErr != nil {
                                                panic(sendErr)
                                        }
                                }
                                return
                        }
                }
        }()
        group.Wait()
}

使用Java

<dependencies>
        <dependency>
            <groupId>javax.websocket</groupId>
            <artifactId>javax.websocket-client-api</artifactId>
            <version>{version}</version>
        </dependency>

        <dependency>
            <groupId>org.java-websocket</groupId>
            <artifactId>Java-WebSocket</artifactId>
            <version>{version}</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>{version}</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>{version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/com.volcengine/volc-sdk-java -->
        <dependency>
            <groupId>com.volcengine</groupId>
            <artifactId>volc-sdk-java</artifactId>
            <version>{version}</version>
        </dependency>

</dependencies>
//src/main/java/translate/TranslateConfig.java
package translate;

import com.volcengine.helper.Const;
import com.volcengine.model.ApiInfo;
import com.volcengine.model.Credentials;
import com.volcengine.model.ServiceInfo;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Static configuration for the speech translation API: credentials,
 * endpoint, and the service/API descriptions handed to the SDK.
 */
public class TranslateConfig {
    // Placeholder credentials; replace with a real key pair.
    public static String accessKey = "your access key";
    public static String secretKey = "your secret key";
    // Action name; also used as the key of apiInfoList.
    public static String api = "SpeechTranslate";
    public static String path = "/api/translate/speech/v1/";
    public static String host = "translate.volces.com";

    // Service description: 5000 for both timeouts (presumably milliseconds --
    // confirm against the SDK), the host, a default Accept header, and
    // credentials scoped to the "translate" service in cn-north-1.
    public static ServiceInfo serviceInfo = new ServiceInfo(
            new HashMap<>() {
                {
                    put(Const.CONNECTION_TIMEOUT, 5000);
                    put(Const.SOCKET_TIMEOUT, 5000);
                    put(Const.Host, host);
                    put(Const.Header, new ArrayList<Header>() {
                        {
                            add(new BasicHeader("Accept", "application/json"));
                        }
                    });
                    put(Const.Credentials, new Credentials(Const.REGION_CN_NORTH_1, "translate"));
                }
            }
    );
    // API table with a single entry: GET on path with Action/Version query
    // parameters. Declared after api and path because it reads them while
    // initialising.
    public static Map<String, ApiInfo> apiInfoList = new HashMap<>() {
        {
            put(api, new ApiInfo(
                    new HashMap<>() {
                        {
                            put(Const.Method, "GET");
                            put(Const.Path, path);
                            put(Const.Query, new ArrayList<NameValuePair>() {
                                {
                                    add(new BasicNameValuePair("Action", api));
                                    add(new BasicNameValuePair("Version", "2020-06-01"));
                                }
                            });
                        }
                    }
            ));
        }
    };
}

// src/main/java/translate/TranslateService.java
package translate;

import com.volcengine.model.ApiInfo;
import com.volcengine.model.ServiceInfo;
import com.volcengine.service.BaseServiceImpl;

import java.util.Map;

/**
 * Thin subclass of the SDK's BaseServiceImpl that only forwards the service
 * and API descriptions to the base constructor.
 */
public class TranslateService extends BaseServiceImpl {

    /**
     * @param info        service description passed to the base class
     * @param apiInfoList API descriptions keyed by action name
     */
    public TranslateService(ServiceInfo info, Map<String, ApiInfo> apiInfoList) {
        super(info, apiInfoList);
    }
}

// src/main/java/websocket/Client.java
package websocket;

import org.java_websocket.client.WebSocketClient;
import org.java_websocket.handshake.ServerHandshake;

import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

/**
 * Websocket client that logs lifecycle events and prints every received
 * message, whether it arrives as a text or a binary frame.
 */
public class Client extends WebSocketClient {
    /**
     * @param serverUri websocket URI to connect to
     */
    public Client(URI serverUri) {
        super(serverUri);
    }

    @Override
    public void onOpen(ServerHandshake serverHandshake) {
        System.out.println("onopen");
    }

    /** Prints a text message as received. */
    @Override
    public void onMessage(String s) {
        System.out.println("onmessage");
        System.out.println(s);
    }

    /**
     * Prints a binary message decoded as UTF-8.
     *
     * Only the buffer's remaining bytes are decoded. ByteBuffer.array()
     * would expose the whole backing array regardless of position/limit and
     * throws for read-only or direct buffers. A duplicate is decoded so the
     * caller's buffer position is left untouched.
     */
    @Override
    public void onMessage(ByteBuffer message) {
        System.out.println("onmessage");
        System.out.println(StandardCharsets.UTF_8.decode(message.duplicate()).toString());
    }

    @Override
    public void onClose(int i, String s, boolean b) {

        System.out.println("onclose");
    }

    @Override
    public void onError(Exception e) {
        e.printStackTrace();
    }
}

//src/main/java/Main.java
import translate.TranslateConfig;
import translate.TranslateService;
import websocket.Client;

import java.io.File;
import java.io.FileInputStream;
import java.net.URI;
import java.util.Base64;

/**
 * Sample entry point: signs the websocket URL, sends the configuration
 * packet, streams audio.wav in paced chunks and finishes with an End packet.
 * Results are printed by {@link Client}.
 */
public class Main {

    public static void main(String[] args) throws Exception {
        File input = new File("audio.wav");

        TranslateService translateService = new TranslateService(TranslateConfig.serviceInfo, TranslateConfig.apiInfoList);
        translateService.setAccessKey(TranslateConfig.accessKey);
        translateService.setSecretKey(TranslateConfig.secretKey);

        String signUrl = translateService.getSignUrl(TranslateConfig.api, null);
        URI url = new URI("wss://" + TranslateConfig.host + TranslateConfig.path + "?" + signUrl);
        System.out.println(url);


        // open websocket

        Client client = new Client(url);

        client.connectBlocking();
        // The first message on the socket must be the configuration packet.
        client.send("{\n" +
                "    \"Configuration\": {\n" +
                "        \"SourceLanguage\": \"zh\",\n" +
                "        \"TargetLanguages\": [\n" +
                "            \"en\"\n" +
                "        ],\n" +
                "        \"HotWordList\": [\n" +
                "            {\n" +
                "                \"Word\": \"hello\",\n" +
                "                \"Scale\": 1\n" +
                "            }\n" +
                "        ]\n" +
                "    }\n" +
                "}");


        // 200 * 32 bytes = 200 ms of 16 kHz, 16-bit, mono audio.
        byte[] buffer = new byte[200 * 32];
        // Upper bound on how many bytes of the file are streamed.
        int bytesLeft = 100 * 1024 * 1024;
        try (FileInputStream fis = new FileInputStream(input)) {
            while (bytesLeft > 0) {
                int read = fis.read(buffer, 0, Math.min(bytesLeft, buffer.length));
                if (read == -1) {
                    break;
                }
                // Send only the bytes actually read; on a short read the tail
                // of buffer still holds data from the previous chunk.
                client.send(bytesToMessage(java.util.Arrays.copyOf(buffer, read)));
                Thread.sleep(200);
                bytesLeft -= read;
            }
        } finally {
            // Always tell the server that no more audio will follow.
            client.send("{\n" +
                    "    \"End\": true\n" +
                    "}");
        }
    }

    /**
     * Wraps raw audio bytes in the JSON packet {"AudioData": "<base64>"}.
     *
     * @param data audio bytes; the whole array is encoded
     * @return the JSON text to send over the websocket
     */
    static String bytesToMessage(byte[] data) {
        String base64Data = Base64.getEncoder().encodeToString(data);
        return "{\n" +
                "    \"AudioData\": \"" +
                base64Data +
                "\"\n" +
                "}";
    }
}

使用Node.js

依赖 volc-sdk-nodejs(npm 包 @volcengine/openapi)
依赖 ws

import {Signer} from "@volcengine/openapi";
import {Credentials, RequestObj} from "@volcengine/openapi/lib/base/types";
import WebSocket from 'ws';
import * as fs from 'fs';
import {open} from 'node:fs/promises';
import {sleep} from "@volcengine/openapi/lib/services/rocketmq/utils/common";

const host = "translate.volces.com"
const path = "/api/translate/speech/v1/"

/**
 * Builds the signed wss:// URL for the SpeechTranslate action.
 *
 * The request description (GET, cn-north-1, Action/Version parameters) is
 * signed for the "translate" service with the configured key pair, and the
 * resulting query string is appended to the host and path.
 */
function getWebsocketUrl(): string {
    const credentials: Credentials = {
        accessKeyId: "your access key",
        secretKey: "your secret key",
    };
    const request: RequestObj = {
        method: "GET",
        region: "cn-north-1",
        params: {
            Action: "SpeechTranslate",
            Version: "2020-06-01",
        },
        pathname: "/api/translate/speech/v1/"
    };

    const signedQuery = new Signer(request, "translate").getSignUrl(credentials);
    return `wss://${host}${path}?${signedQuery}`;
}

// Sign the URL once so the logged URL is the one actually used to connect.
const websocketUrl = getWebsocketUrl();
console.log(websocketUrl);
const ws = new WebSocket(websocketUrl);
ws.on('open', async () => {
    console.log("onopen")
    // The first message on the socket must be the configuration packet.
    const configuration = {
        "Configuration": {
            "SourceLanguage": "en",
            "TargetLanguages": [
                "zh"
            ],
            "HotWordList": [
                {
                    "Word": "hello",
                    "Scale": 1
                }
            ],
            "Extra": {}
        }
    };
    ws.send(JSON.stringify(configuration));

    // highWaterMark caps each chunk at 200 * 32 bytes (200 ms of 16 kHz,
    // 16-bit, mono audio). Iterating the stream sequentially sends chunks
    // in order, with no overlapping handlers, and reaches the End packet
    // exactly once, after the whole file has been read.
    const readStream = fs.createReadStream('audio.wav', {highWaterMark: 200 * 32});
    try {
        for await (const chunk of readStream) {
            await sleep(200).catch(e => {
                console.error(e);
            })
            const audio = {
                "AudioData": Buffer.from(chunk).toString("base64")
            };
            ws.send(JSON.stringify(audio));
        }
    } catch (e) {
        // A read failure is reported instead of crashing the process.
        console.error(e);
    }
    ws.send(JSON.stringify({
        "End": true
    }));
});

// Print every message the server sends back.
ws.on('message', (data) => {
    console.log("%s", data);
});

ws.on("close", () => {
    console.log("onclose")
});

ws.on("error", (err) => {
    console.error(err)
});

ws.on("ping", () => {
    console.log("headers")
});
ws.on("pong", () => {
});