在进行对话式 AI 实时交互场景下,你可能需要实时字幕显示你与智能体的对话内容。在 StartVoiceChat
接口中配置 SubtitleConfig
结构体参数后,你可以开启房间内字幕回调功能。开启此功能后,你可在应用服务器和客户端接收字幕结果,以做后续操作。客户端通过 onRoomBinaryMessageReceived 接口实时返回字幕结果,你可以用来显示实时字幕。服务端通过设定的ServerMessageUrl
按照每句话返回字幕结果,你可以用来存储分析对话数据。
服务端
字幕回调格式
返回的字幕回调格式如下:
参数名 | 类型 | 描述 |
---|
message | String | Base 64 编码的二进制消息内容。格式参看二进制消息格式。 |
signature | String | 鉴权签名。可与StartVoiceChat 接口中传入的ServerMessageSignature 字段值进行对比以进行鉴权验证。 |
二进制消息格式如下:
参数名 | 类型 | 描述 |
---|
magic number | binary | 消息格式,固定为 subv 。 |
length | binary | 字幕消息长度,单位为 bytes。存放方式为大端序。 |
subtitle_message | binary | 字幕消息详细信息。格式参看subtitle_message 格式。 |
subtitle_message 格式:
参数名 | 类型 | 描述 |
---|
type | String | 消息格式,固定为 subtitle 。 |
data | Array | 字幕详细信息数组,每个元素的格式参看 data。 |
data
参数名 | 类型 | 描述 |
---|
text | String | 字幕文本。 |
language | String | 字幕语言。 |
userId | String | 说话人 ID。 |
sequence | Int | 字幕序号。 |
definite | Boolean | 字幕是否为完整的一句话。 |
paragraph | Boolean | 字幕是否为一段完整的文本。 |
解析字幕信息内容
你可以参考以下示例代码对回调信息中的message
内容进行解析。
// subtitleHeader is the 4-byte magic number that Unpack expects at the start
// of every decoded subtitle message.
const subtitleHeader = "subv"

// exampleSignature is the value HandleSubtitle compares against the
// "signature" field of an incoming callback.
const exampleSignature = "example_signature"
// RtsMessage is the JSON body of a subtitle callback request.
type RtsMessage struct {
	// Message is read from the "message" key; Unpack Base64-decodes it.
	Message string `json:"message"`
	// Signature is read from the "signature" key and checked by HandleSubtitle.
	Signature string `json:"signature"`
}
// Subv is the JSON document carried after the 8-byte header of a subtitle
// message.
type Subv struct {
	// Type is read from the "type" key.
	Type string `json:"type"`
	// Data is read from the "data" key and holds the subtitle entries.
	Data []Data `json:"data"`
}
// Data is a single subtitle entry inside Subv.Data.
type Data struct {
	// Definite is read from the "definite" key.
	Definite bool `json:"definite"`
	// Paragraph is read from the "paragraph" key.
	Paragraph bool `json:"paragraph"`
	// Language is read from the "language" key.
	Language string `json:"language"`
	// Sequence is read from the "sequence" key.
	Sequence int `json:"sequence"`
	// Text is read from the "text" key.
	Text string `json:"text"`
	// UserID is read from the "userId" key.
	UserID string `json:"userId"`
}
// HandleSubtitle is the gin handler for the server-side subtitle callback.
//
// It binds the JSON request body into an RtsMessage, compares the signature
// with exampleSignature, decodes the payload with Unpack and prints the
// result. Every failure path answers with an explicit error status so that a
// rejected callback is not reported to the sender as a success.
func HandleSubtitle(c *gin.Context) {
	msg := &RtsMessage{}
	// msg is already a pointer, so it is passed as-is rather than as &msg.
	if err := c.BindJSON(msg); err != nil {
		// NOTE(review): gin documents BindJSON as aborting the request with
		// HTTP 400 on failure, so no response is written here — confirm
		// against the gin version in use.
		fmt.Printf("BindJson failed,err:%v\n", err)
		return
	}
	if msg.Signature != exampleSignature {
		fmt.Printf("Signature not match\n")
		c.String(401, "signature not match")
		return
	}
	subv, err := Unpack(msg.Message)
	if err != nil {
		fmt.Printf("Unpack failed,err:%v\n", err)
		c.String(400, "invalid message")
		return
	}
	fmt.Println(subv)
	// Business logic goes here.
	c.String(200, "ok")
}
// Unpack decodes a Base64-encoded subtitle message into a Subv.
//
// After Base64 decoding, the message must consist of:
//   - bytes 0..3: the magic number, equal to subtitleHeader;
//   - bytes 4..7: the payload length as a big-endian uint32;
//   - the remaining bytes: a JSON document of exactly that length.
//
// It returns an error when Base64 decoding fails, when the message is shorter
// than the 8-byte header, when the magic number or the length does not match,
// or when the JSON payload cannot be unmarshalled. Underlying errors are
// wrapped with %w so callers can inspect them.
func Unpack(msg string) (*Subv, error) {
	const headerSize = 8

	data, err := base64.StdEncoding.DecodeString(msg)
	if err != nil {
		return nil, fmt.Errorf("DecodeString failed,err:%w", err)
	}
	if len(data) < headerSize {
		return nil, fmt.Errorf("Data invalid")
	}
	if string(data[:4]) != subtitleHeader {
		return nil, fmt.Errorf("Header not match")
	}
	// Compare in uint64 so neither side can wrap around or be truncated.
	dataSize := binary.BigEndian.Uint32(data[4:headerSize])
	if uint64(dataSize) != uint64(len(data)-headerSize) {
		return nil, fmt.Errorf("Size not match")
	}
	subv := &Subv{}
	if err := json.Unmarshal(data[headerSize:], subv); err != nil {
		// No trailing newline: the caller adds its own when printing.
		return nil, fmt.Errorf("Unmarshal failed,err:%w", err)
	}
	return subv, nil
}
// main registers HandleSubtitle on the example callback path and starts the
// gin server. The error returned by Run is printed instead of being dropped.
func main() {
	r := gin.Default()
	r.POST("/example_domain/vertc/subtitle", HandleSubtitle)
	if err := r.Run(); err != nil {
		fmt.Printf("Run failed,err:%v\n", err)
	}
}
客户端
字幕回调格式
参数名 | 类型 | 描述 |
---|
uid | String | 消息发送者 ID。 |
message | binary | 二进制消息内容(客户端回调直接返回原始二进制数据,无需 Base 64 解码)。与服务端返回二进制消息格式相同,详细参看二进制消息格式。 |
解析字幕回调信息
你可以参考以下示例代码对回调信息中的message
内容进行解析。
// One subtitle entry. ParseData fills each member from the JSON key of the
// same name.
struct SubtitleMsgData {
    bool        definite;   // JSON key "definite"
    std::string language;   // JSON key "language"
    int         mode;       // JSON key "mode"
    bool        paragraph;  // JSON key "paragraph"
    int         sequence;   // JSON key "sequence"
    std::string text;       // JSON key "text"
    std::string userId;     // JSON key "userId"
};
// Callback for a binary room message: unpacks the payload and, if unpacking
// succeeds, hands the extracted text to ParseData.
void onRoomBinaryMessageReceived(const char* uid, int size, const uint8_t* message) {
    std::string payload;
    if (!Unpack(message, size, payload)) {
        return;
    }
    ParseData(payload);
}
// Validates the 8-byte header of a subtitle message and extracts its payload.
//
// Layout: bytes 0..3 are the magic number "subv" (0x73756276), bytes 4..7 are
// the payload length as a big-endian 32-bit integer, followed by the payload.
//
// Returns true and stores the payload in `subtitles` when the message has at
// least 8 bytes, the magic number matches and the declared length equals
// size - 8. Returns false otherwise, leaving `subtitles` untouched.
bool Unpack(const uint8_t *message, int size, std::string& subtitles) {
    const int kHeaderBytes = 8;
    const uint32_t kMagicSubv = 0x73756276U;

    if (size < kHeaderBytes) {
        return false;
    }

    // Assemble both big-endian 32-bit header fields one byte at a time.
    uint32_t magic = 0;
    uint32_t payloadLen = 0;
    for (int i = 0; i < 4; ++i) {
        magic = (magic << 8) | message[i];
        payloadLen = (payloadLen << 8) | message[4 + i];
    }

    if (magic != kMagicSubv) {
        return false;
    }
    if (static_cast<uint32_t>(size - kHeaderBytes) != payloadLen) {
        return false;
    }

    // A zero-length assign leaves an empty string.
    subtitles.assign(reinterpret_cast<const char*>(message) + kHeaderBytes, payloadLen);
    return true;
}
//解析
void ParseData(const std::string& msg) {
// 解析 JSON 字符串
nlohmann::json json_data = nlohmann::json::parse(subtitles);
// 存储解析后的数据
std::vector<SubtitleMsgData> subtitles;
// 遍历 JSON 数据并填充结构体
for (const auto& item : json_data["data"]) {
SubtitleMsgData subData;
subData.definite = item["definite"];
subData.language = item["language"];
subData.mode = item["mode"];
subData.paragraph = item["paragraph"];
subData.sequence = item["sequence"];
subData.text = item["text"];
subData.userId = item["userId"];
subtitles.push_back(subData);
}
}
/**
 * One subtitle entry. parseData fills each field from the JSON key of the
 * same name.
 */
public class SubtitleMsgData {
    public boolean definite;
    public String language;
    public int mode;
    public boolean paragraph;
    public int sequence;
    public String text;
    public String userId;

    /**
     * Returns a debug representation listing every field; string fields are
     * wrapped in single quotes.
     */
    @Override
    public String toString() {
        // The single-quote char literal must be escaped as '\''.
        return "SubtitleMsgData{" +
                "definite=" + definite +
                ", language='" + language + '\'' +
                ", mode=" + mode +
                ", paragraph=" + paragraph +
                ", sequence=" + sequence +
                ", text='" + text + '\'' +
                ", userId='" + userId + '\'' +
                '}';
    }
}
/**
 * Callback for a binary room message: flips the buffer, unpacks the payload
 * and, if unpacking succeeds, passes the extracted text to parseData.
 */
public void onRoomBinaryMessageReceived(String uid, ByteBuffer buffer) {
    // NOTE(review): flip() sets limit to the current position and position to
    // 0. This assumes the buffer arrives positioned at the end of the written
    // data — confirm against the SDK, since a buffer that is already at
    // position 0 would be emptied by this call.
    buffer.flip();
    StringBuilder payload = new StringBuilder();
    if (!unpack(buffer, payload)) {
        return;
    }
    parseData(payload.toString());
}
/**
 * Validates the 8-byte header of a subtitle message and extracts its payload.
 *
 * <p>Layout: 4 bytes magic number "subv" (0x73756276), 4 bytes payload length
 * in big-endian order, then the UTF-8 payload.
 *
 * <p>Both header fields are assembled byte by byte, so the result does not
 * depend on the buffer's configured byte order. The previous version applied
 * Integer.reverseBytes to getInt(), which decodes the length as little-endian
 * on a buffer with the default big-endian order. Each byte is also masked with
 * 0xFF so that values of 0x80 and above are not sign-extended.
 *
 * @param message   buffer positioned at the start of the message
 * @param subtitles receives the decoded payload on success
 * @return true if the header is valid and the declared length equals the
 *         number of remaining bytes; false otherwise
 */
public static boolean unpack(ByteBuffer message, StringBuilder subtitles) {
    final int kSubtitleHeaderSize = 8;
    final int kSubtitleMagic = 0x73756276; // "subv"
    if (message.remaining() < kSubtitleHeaderSize) {
        return false;
    }
    int magicNumber = 0;
    for (int i = 0; i < 4; i++) {
        magicNumber = (magicNumber << 8) | (message.get() & 0xFF);
    }
    if (magicNumber != kSubtitleMagic) {
        return false;
    }
    // Kept in a long so an unsigned 32-bit length never turns negative.
    long length = 0;
    for (int i = 0; i < 4; i++) {
        length = (length << 8) | (message.get() & 0xFF);
    }
    if (message.remaining() != length) {
        return false;
    }
    // Read the subtitle payload; length fits in an int because it equals remaining().
    byte[] subtitleBytes = new byte[(int) length];
    message.get(subtitleBytes);
    subtitles.append(new String(subtitleBytes, StandardCharsets.UTF_8));
    return true;
}
/**
 * Parses the JSON text extracted by unpack and collects the entries of its
 * "data" array into a list of SubtitleMsgData.
 *
 * <p>Fields are read with JsonNode.path(), which returns a "missing" node for
 * an absent key instead of null. An entry lacking a key therefore gets the
 * default value (false / 0 / "") rather than raising a NullPointerException
 * that would discard every entry of the message.
 *
 * @param msg JSON text of the subtitle message
 */
public static void parseData(String msg) {
    try {
        // Parse the JSON text.
        ObjectMapper objectMapper = new ObjectMapper();
        JsonNode jsonData = objectMapper.readTree(msg);
        if (jsonData == null) {
            return;
        }
        // Holds the parsed entries.
        List<SubtitleMsgData> subtitles = new ArrayList<>();
        // Iterating a missing "data" node yields no elements.
        for (JsonNode item : jsonData.path("data")) {
            SubtitleMsgData subData = new SubtitleMsgData();
            subData.definite = item.path("definite").asBoolean();
            subData.language = item.path("language").asText();
            subData.mode = item.path("mode").asInt();
            subData.paragraph = item.path("paragraph").asBoolean();
            subData.sequence = item.path("sequence").asInt();
            subData.text = item.path("text").asText();
            subData.userId = item.path("userId").asText();
            subtitles.add(subData);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/// One subtitle entry. parseData fills each property from the JSON key of the
/// same name.
@interface SubtitleMsgData : NSObject

/// JSON key "definite".
@property (nonatomic, assign) BOOL definite;
/// JSON key "language".
@property (nonatomic, copy) NSString *language;
/// JSON key "mode".
@property (nonatomic, assign) NSInteger mode;
/// JSON key "paragraph".
@property (nonatomic, assign) BOOL paragraph;
/// JSON key "sequence".
@property (nonatomic, assign) NSInteger sequence;
/// JSON key "text".
@property (nonatomic, copy) NSString *text;
/// JSON key "userId".
@property (nonatomic, copy) NSString *userId;

@end

// All accessors are auto-synthesized; no custom behavior.
@implementation SubtitleMsgData

@end
/// Callback for a binary room message: unpacks the payload and, if unpacking
/// succeeds, passes the extracted text to parseData.
- (void)rtcRoom:(ByteRTCRoom *_Nonnull)rtcRoom onRoomBinaryMessageReceived:(NSString *_Nonnull)uid message:(NSData *_Nonnull)message {
    // Pass the `message` parameter; the previous code referenced an
    // undeclared `buffer` variable.
    NSString *subtitles = unpack(message);
    if (subtitles) {
        parseData(subtitles);
    }
}
// Reverses the byte order of a 32-bit value (byte 0 <-> byte 3, byte 1 <-> byte 2).
uint32_t swapUInt32(uint32_t value) {
    uint32_t lowest  = value & 0xFFu;
    uint32_t lower   = (value >> 8) & 0xFFu;
    uint32_t higher  = (value >> 16) & 0xFFu;
    uint32_t highest = (value >> 24) & 0xFFu;
    return (lowest << 24) | (lower << 16) | (higher << 8) | highest;
}
// Validates the 8-byte header of a subtitle message and extracts its payload.
//
// Layout: bytes 0..3 are the magic number "subv" (0x73756276), bytes 4..7 are
// the payload length as a big-endian 32-bit integer, followed by the UTF-8
// payload.
//
// Returns the payload as a string (@"" for a zero-length payload). Returns nil
// when the data is shorter than the header, the magic number or the length
// does not match, or the payload cannot be decoded as UTF-8.
NSString *unpack(NSData *data) {
    const NSUInteger kSubtitleHeaderSize = 8;
    const uint32_t kSubtitleMagic = 0x73756276u;

    NSUInteger size = data.length;
    if (size < kSubtitleHeaderSize) {
        return nil;
    }
    const uint8_t *message = (const uint8_t *)data.bytes;

    // Each byte is widened to uint32_t before shifting. A bare uint8_t is
    // promoted to signed int, and shifting a value >= 0x80 left by 24 bits
    // would overflow it.
    uint32_t magic = ((uint32_t)message[0] << 24) |
                     ((uint32_t)message[1] << 16) |
                     ((uint32_t)message[2] << 8) |
                     (uint32_t)message[3];
    if (magic != kSubtitleMagic) {
        return nil;
    }

    uint32_t length = ((uint32_t)message[4] << 24) |
                      ((uint32_t)message[5] << 16) |
                      ((uint32_t)message[6] << 8) |
                      (uint32_t)message[7];
    if (size - kSubtitleHeaderSize != length) {
        return nil;
    }

    if (length == 0) {
        return @"";
    }
    return [[NSString alloc] initWithBytes:message + kSubtitleHeaderSize
                                    length:length
                                  encoding:NSUTF8StringEncoding];
}
// Returns `value` if it is an NSString, otherwise nil.
static NSString *subtitleStringOrNil(id value) {
    return [value isKindOfClass:[NSString class]] ? (NSString *)value : nil;
}

// Returns `value` if it is an NSNumber, otherwise nil.
static NSNumber *subtitleNumberOrNil(id value) {
    return [value isKindOfClass:[NSNumber class]] ? (NSNumber *)value : nil;
}

// Parses the JSON text extracted by unpack and collects the entries of its
// "data" array into an array of SubtitleMsgData.
//
// Hardening over the previous version:
//  - success is decided by the parser's return value, not by the error pointer;
//  - nil data is never handed to NSJSONSerialization;
//  - the root object, the "data" member and each entry are type-checked before
//    use, so unexpected JSON shapes are skipped;
//  - fields of the wrong type (including NSNull) fall back to NO / 0 / nil.
void parseData(NSString *msg) {
    NSData *jsonBytes = [msg dataUsingEncoding:NSUTF8StringEncoding];
    if (jsonBytes == nil) {
        return;
    }

    NSError *error = nil;
    id root = [NSJSONSerialization JSONObjectWithData:jsonBytes options:0 error:&error];
    if (root == nil) {
        NSLog(@"JSON Parse Error: %@", error);
        return;
    }
    if (![root isKindOfClass:[NSDictionary class]]) {
        return;
    }

    id entries = ((NSDictionary *)root)[@"data"];
    if (![entries isKindOfClass:[NSArray class]]) {
        return;
    }

    NSMutableArray<SubtitleMsgData *> *subtitles = [NSMutableArray array];
    for (id entry in (NSArray *)entries) {
        if (![entry isKindOfClass:[NSDictionary class]]) {
            continue;
        }
        NSDictionary *item = (NSDictionary *)entry;
        SubtitleMsgData *subData = [[SubtitleMsgData alloc] init];
        // Messaging nil returns NO / 0, which serves as the default value.
        subData.definite = [subtitleNumberOrNil(item[@"definite"]) boolValue];
        subData.language = subtitleStringOrNil(item[@"language"]);
        subData.mode = [subtitleNumberOrNil(item[@"mode"]) integerValue];
        subData.paragraph = [subtitleNumberOrNil(item[@"paragraph"]) boolValue];
        subData.sequence = [subtitleNumberOrNil(item[@"sequence"]) integerValue];
        subData.text = subtitleStringOrNil(item[@"text"]);
        subData.userId = subtitleStringOrNil(item[@"userId"]);
        [subtitles addObject:subData];
    }
}