视觉模型支持传入图片、视频和文档,完成图像描述、视觉问答、内容审核等视觉相关任务。
250615之后版本的视觉模型,如无特殊说明,默认支持 Responses API,具体请参见:视觉理解能力。
模型支持理解图片的信息,并结合这些信息完成如描述其中的物体等视觉相关任务。关于图片理解支持的图片输入方式及示例,请参见图片理解。
建议优先使用 Files API 上传本地文件,适用于图片文件较大,或者需要在多个请求中重复使用图片的场景。
该方式支持最大 512MB 的文件,默认存储 7 天,存储有效期取值范围为1-30天。
示例中用到的图片文件及提示词如下:
| 图片文件 | 提示词 |
|---|---|
| (示例图片) | 请给出图片中的内容,并根据图片内容回答支持输入图片的模型系列是哪个? |
代码示例:
上传图片文件获取File ID。
# Upload a local image through the Files API; the response carries the File ID.
curl https://ark.cn-beijing.volces.com/api/v3/files \
  -H "Authorization: Bearer $ARK_API_KEY" \
  -F "purpose=user_data" \
  -F "file=@/Users/doc/demo.png"
在Responses API中引用File ID。
# Reference the uploaded image by its File ID in a Responses API request.
curl https://ark.cn-beijing.volces.com/api/v3/responses \
  -H "Authorization: Bearer $ARK_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "doubao-seed-1-6-251015",
    "input": [
      {
        "role": "user",
        "content": [
          {
            "type": "input_image",
            "file_id": "file-20251018****"
          },
          {
            "type": "input_text",
            "text": "请给出图片中的内容,并根据图片内容回答支持输入图片的模型系列是哪个?"
          }
        ]
      }
    ]
  }'
import asyncio
import os

from volcenginesdkarkruntime import AsyncArk
from volcenginesdkarkruntime.types.responses.response_completed_event import ResponseCompletedEvent
from volcenginesdkarkruntime.types.responses.response_reasoning_summary_text_delta_event import ResponseReasoningSummaryTextDeltaEvent
from volcenginesdkarkruntime.types.responses.response_output_item_added_event import ResponseOutputItemAddedEvent
from volcenginesdkarkruntime.types.responses.response_text_delta_event import ResponseTextDeltaEvent
from volcenginesdkarkruntime.types.responses.response_text_done_event import ResponseTextDoneEvent

# Async Ark client; the API key is read from the ARK_API_KEY environment variable.
client = AsyncArk(
    base_url='https://ark.cn-beijing.volces.com/api/v3',
    api_key=os.getenv('ARK_API_KEY')
)


async def main():
    """Upload a local image via the Files API, then stream a Responses API answer about it."""
    # upload picture file
    print("Upload picture file")
    # replace with your local picture path; the handle is closed once the upload returns
    with open("/Users/doc/ark_demo_img_1.png", "rb") as image_file:
        file = await client.files.create(
            file=image_file,
            purpose="user_data"
        )
    print(f"File uploaded: {file.id}")

    # Wait for the file to finish processing
    await client.files.wait_for_processing(file.id)
    print(f"File processed: {file.id}")

    stream = await client.responses.create(
        model="doubao-seed-1-6-251015",
        input=[
            {"role": "user", "content": [
                {
                    "type": "input_image",
                    "file_id": file.id  # ref picture file id
                },
                {
                    "type": "input_text",
                    "text": "请给出图片中的内容,并根据图片内容回答支持输入图片的模型系列是哪个?"
                }
            ]},
        ],
        caching={
            "type": "enabled",
        },
        store=True,
        stream=True
    )

    # Print reasoning and answer text as it arrives, then the usage summary.
    async for event in stream:
        if isinstance(event, ResponseReasoningSummaryTextDeltaEvent):
            print(event.delta, end="")
        if isinstance(event, ResponseOutputItemAddedEvent):
            print("\noutPutItem " + event.type + " start:")
        if isinstance(event, ResponseTextDeltaEvent):
            print(event.delta, end="")
        if isinstance(event, ResponseTextDoneEvent):
            print("\noutPutTextDone.")
        if isinstance(event, ResponseCompletedEvent):
            print("Response Completed. Usage = " + event.response.usage.model_dump_json())


if __name__ == "__main__":
    asyncio.run(main())
package main

import (
	"context"
	"fmt"
	"io"
	"os"
	"time"

	"github.com/volcengine/volcengine-go-sdk/service/arkruntime"
	"github.com/volcengine/volcengine-go-sdk/service/arkruntime/model/file"
	"github.com/volcengine/volcengine-go-sdk/service/arkruntime/model/responses"
	"github.com/volcengine/volcengine-go-sdk/volcengine"
)

// main uploads a local image through the Files API, polls until the file has
// left the processing state, and then streams a Responses API answer about it.
func main() {
	client := arkruntime.NewClientWithApiKey(
		// Get API Key:https://console.volcengine.com/ark/region:ark+cn-beijing/apikey
		os.Getenv("ARK_API_KEY"),
		arkruntime.WithBaseUrl("https://ark.cn-beijing.volces.com/api/v3"),
	)
	ctx := context.Background()

	fmt.Println("----- upload image data -----")
	data, err := os.Open("/Users/doc/ark_demo_img_1.png")
	if err != nil {
		fmt.Printf("read file error: %v\n", err)
		return
	}
	// Release the local file handle once main returns.
	defer data.Close()

	fileInfo, err := client.UploadFile(ctx, &file.UploadFileRequest{
		File:    data,
		Purpose: file.PurposeUserData,
	})
	if err != nil {
		fmt.Printf("upload file error: %v\n", err)
		return
	}

	// Wait for the file to finish processing
	for fileInfo.Status == file.StatusProcessing {
		fmt.Println("Waiting for image to be processed...")
		time.Sleep(2 * time.Second)
		fileInfo, err = client.RetrieveFile(ctx, fileInfo.ID) // update file info
		if err != nil {
			fmt.Printf("get file status error: %v\n", err)
			return
		}
	}
	fmt.Printf("Image processing completed: %s, status: %s\n", fileInfo.ID, fileInfo.Status)

	// One user message: the uploaded image (by File ID) followed by the prompt.
	inputMessage := &responses.ItemInputMessage{
		Role: responses.MessageRole_user,
		Content: []*responses.ContentItem{
			{
				Union: &responses.ContentItem_Image{
					Image: &responses.ContentItemImage{
						Type:   responses.ContentItemType_input_image,
						FileId: volcengine.String(fileInfo.ID),
					},
				},
			},
			{
				Union: &responses.ContentItem_Text{
					Text: &responses.ContentItemText{
						Type: responses.ContentItemType_input_text,
						Text: "请给出图片中的内容,并根据图片内容回答支持输入图片的模型系列是哪个?",
					},
				},
			},
		},
	}

	createResponsesReq := &responses.ResponsesRequest{
		Model: "doubao-seed-1-6-251015",
		Input: &responses.ResponsesInput{
			Union: &responses.ResponsesInput_ListValue{
				ListValue: &responses.InputItemList{ListValue: []*responses.InputItem{{
					Union: &responses.InputItem_InputMessage{
						InputMessage: inputMessage,
					},
				}}},
			},
		},
		Caching: &responses.ResponsesCaching{Type: responses.CacheType_enabled.Enum()},
	}

	resp, err := client.CreateResponsesStream(ctx, createResponsesReq)
	if err != nil {
		fmt.Printf("stream error: %v\n", err)
		return
	}

	var responseID string
	for {
		event, err := resp.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Printf("stream error: %v\n", err)
			return
		}
		handleEvent(event)
		if responseEvent := event.GetResponse(); responseEvent != nil {
			responseID = responseEvent.GetResponse().GetId()
			fmt.Printf("Response ID: %s\n", responseID)
		}
	}
}

// handleEvent prints reasoning and output text carried by a stream event and
// ignores every other event type.
func handleEvent(event *responses.Event) {
	switch event.GetEventType() {
	case responses.EventType_response_reasoning_summary_text_delta.String():
		// fmt.Print keeps deltas on stdout with the rest of the output; the
		// builtin print writes to stderr.
		fmt.Print(event.GetReasoningText().GetDelta())
	case responses.EventType_response_reasoning_summary_text_done.String(): // aggregated reasoning text
		fmt.Printf("\nAggregated reasoning text: %s\n", event.GetReasoningText().GetText())
	case responses.EventType_response_output_text_delta.String():
		fmt.Print(event.GetText().GetDelta())
	case responses.EventType_response_output_text_done.String(): // aggregated output text
		fmt.Printf("\nAggregated output text: %s\n", event.GetTextDone().GetText())
	default:
		return
	}
}
package com.ark.example;

import com.volcengine.ark.runtime.model.files.FileMeta;
import com.volcengine.ark.runtime.model.files.PreprocessConfigs;
import com.volcengine.ark.runtime.model.files.UploadFileRequest;
import com.volcengine.ark.runtime.model.files.Video;
import com.volcengine.ark.runtime.model.responses.content.InputContentItemImage;
import com.volcengine.ark.runtime.service.ArkService;
import com.volcengine.ark.runtime.model.responses.request.*;
import com.volcengine.ark.runtime.model.responses.item.ItemEasyMessage;
import com.volcengine.ark.runtime.model.responses.constant.ResponsesConstants;
import com.volcengine.ark.runtime.model.responses.item.MessageContent;
import com.volcengine.ark.runtime.model.responses.content.InputContentItemVideo;
import com.volcengine.ark.runtime.model.responses.content.InputContentItemText;
import com.volcengine.ark.runtime.model.responses.event.functioncall.FunctionCallArgumentsDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.outputitem.OutputItemAddedEvent;
import com.volcengine.ark.runtime.model.responses.event.outputitem.OutputItemDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.outputtext.OutputTextDeltaEvent;
import com.volcengine.ark.runtime.model.responses.event.outputtext.OutputTextDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.reasoningsummary.ReasoningSummaryTextDeltaEvent;
import com.volcengine.ark.runtime.model.responses.event.response.ResponseCompletedEvent;

import java.io.File;
import java.util.concurrent.TimeUnit;

/**
 * Uploads a local image through the Files API, polls until it has left the
 * "processing" state, then streams a Responses API answer about the image.
 */
public class demo {
    public static void main(String[] args) {
        String apiKey = System.getenv("ARK_API_KEY");
        ArkService service = ArkService.builder()
                .apiKey(apiKey)
                .baseUrl("https://ark.cn-beijing.volces.com/api/v3")
                .build();

        System.out.println("===== Upload File Example=====");
        // upload an image for responses
        FileMeta fileMeta;
        fileMeta = service.uploadFile(
                UploadFileRequest.builder()
                        .file(new File("/Users/doc/ark_demo_img_1.png")) // replace with your image file path
                        .purpose("user_data")
                        .build());
        System.out.println("Uploaded file Meta: " + fileMeta);
        System.out.println("status:" + fileMeta.getStatus());

        try {
            // Constant-first equals avoids a NullPointerException if the status is absent.
            while ("processing".equals(fileMeta.getStatus())) {
                System.out.println("Waiting for image to be processed...");
                TimeUnit.SECONDS.sleep(2);
                fileMeta = service.retrieveFile(fileMeta.getId());
            }
        } catch (Exception e) {
            System.err.println("get file status error:" + e.getMessage());
            // The file state is unknown, so stop instead of sending a request that references it.
            service.shutdownExecutor();
            return;
        }
        System.out.println("Uploaded file Meta: " + fileMeta);

        // One user message: the uploaded image (by File ID) followed by the prompt.
        CreateResponsesRequest request = CreateResponsesRequest.builder()
                .model("doubao-seed-1-6-251015")
                .stream(true)
                .input(ResponsesInput.builder().addListItem(
                        ItemEasyMessage.builder().role(ResponsesConstants.MESSAGE_ROLE_USER).content(
                                MessageContent.builder()
                                        .addListItem(InputContentItemImage.builder().fileId(fileMeta.getId()).build())
                                        .addListItem(InputContentItemText.builder().text("请给出图片中的内容,并根据图片内容回答支持输入图片的模型系列是哪个?").build())
                                        .build()
                        ).build()
                ).build())
                .build();

        // Print each stream event as it arrives; blockingForEach returns when the stream ends.
        service.streamResponse(request)
                .doOnError(Throwable::printStackTrace)
                .blockingForEach(event -> {
                    if (event instanceof ReasoningSummaryTextDeltaEvent) {
                        System.out.print(((ReasoningSummaryTextDeltaEvent) event).getDelta());
                    }
                    if (event instanceof OutputItemAddedEvent) {
                        System.out.println("\nOutputItem " + (((OutputItemAddedEvent) event).getItem().getType()) + " Start: ");
                    }
                    if (event instanceof OutputTextDeltaEvent) {
                        System.out.print(((OutputTextDeltaEvent) event).getDelta());
                    }
                    if (event instanceof OutputTextDoneEvent) {
                        System.out.println("\nOutputText End.");
                    }
                    if (event instanceof OutputItemDoneEvent) {
                        System.out.println("\nOutputItem " + ((OutputItemDoneEvent) event).getItem().getType() + " End.");
                    }
                    if (event instanceof FunctionCallArgumentsDoneEvent) {
                        System.out.println("\nFunctionCall Arguments: " + ((FunctionCallArgumentsDoneEvent) event).getArguments());
                    }
                    if (event instanceof ResponseCompletedEvent) {
                        System.out.println("\nResponse Completed. Usage = " + ((ResponseCompletedEvent) event).getResponse().getUsage());
                    }
                });

        service.shutdownExecutor();
    }
}
import os
import time

from openai import OpenAI

# Uploads a local image through the Files API with the OpenAI-compatible SDK,
# waits until it has been processed, then streams a Responses API answer about it.
api_key = os.getenv('ARK_API_KEY')

client = OpenAI(
    base_url='https://ark.cn-beijing.volces.com/api/v3',
    api_key=api_key,
)

# replace with your local picture path; the handle is closed once the upload returns
with open("/Users/doc/ark_demo_img_1.png", "rb") as image_file:
    file = client.files.create(
        file=image_file,
        purpose="user_data"
    )

# Wait for the file to finish processing
while file.status == "processing":
    time.sleep(2)
    file = client.files.retrieve(file.id)
print(f"File processed: {file}")

response = client.responses.create(
    model="doubao-seed-1-6-251015",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_image",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": "请给出图片中的内容,并根据图片内容回答支持输入图片的模型系列是哪个?",
                },
            ]
        }
    ],
    stream=True
)

for event in response:
    if event.type == "response.reasoning_summary_text.delta":
        print(event.delta, end="")
    if event.type == "response.output_item.added":
        print("\noutPutItem " + event.type + " start:")
    if event.type == "response.output_text.delta":
        print(event.delta, end="")
    # The "text done" message belongs to the output_text.done event, not output_item.done.
    if event.type == "response.output_text.done":
        print("\noutPutTextDone.")
    if event.type == "response.completed":
        print("\nResponse Completed. Usage = " + event.response.usage.model_dump_json())
输出示例:
### 图片内容描述 图片展示了不同模型系列的输入输出支持情况说明。上方文字提示可结合业务的输入输出信息类型筛选模型,并标注“×:不支持;√:支持”。下方表格包含“模型系列”“输入(文本、图像、音频)”“输出(文本、图像、音频)”等列,列出三个模型系列的支持情况: - **Doubao-1.5-pro**:支持输入文本、输出文本,不支持图像、音频的输入输出; - **Doubao-1.5-lite**:支持输入文本、输出文本,不支持图像、音频的输入输出; - **Doubao-1.5-vision**:支持输入文本和图像,输出文本,不支持音频输入输出及图像、音频输出。 ### 支持输入图像的模型系列 Doubao-1.5-vision
模型可理解视频中的视觉信息,可以完成如描述其中的物体、分析动作逻辑等视觉相关任务。关于视频理解支持的文件输入方式及示例,请参见视频理解。
建议优先使用 Files API 上传本地视频文件,适用于视频文件较大,或者需要在多个请求中重复使用视频的场景。
该方式支持最大 512MB 的文件,默认存储 7 天,存储有效期取值范围为1-30天。
示例中用到的视频文件及提示词如下:
| 视频文件 | 提示词 |
|---|---|
| (示例视频) | 请你描述下视频中的人物的一系列动作,以JSON格式输出开始时间(start_time)、结束时间(end_time)、事件(event)、是否危险(danger),请使用HH:mm:ss表示时间戳。 |
代码示例:
上传视频文件获取File ID。
# Upload a local video through the Files API, sampling it at 0.3 fps during preprocessing.
curl https://ark.cn-beijing.volces.com/api/v3/files \
  -H "Authorization: Bearer $ARK_API_KEY" \
  -F "purpose=user_data" \
  -F "file=@/Users/doc/demo.mp4" \
  -F "preprocess_configs[video][fps]=0.3"
在Responses API中引用File ID。
# Reference the uploaded video by its File ID in a Responses API request.
curl https://ark.cn-beijing.volces.com/api/v3/responses \
  -H "Authorization: Bearer $ARK_API_KEY" \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "doubao-seed-1-6-251015",
    "input": [
      {
        "role": "user",
        "content": [
          {
            "type": "input_video",
            "file_id": "file-20251018****"
          },
          {
            "type": "input_text",
            "text": "请你描述下视频中的人物的一系列动作,以JSON格式输出开始时间(start_time)、结束时间(end_time)、事件(event)、是否危险(danger),请使用HH:mm:ss表示时间戳。"
          }
        ]
      }
    ]
  }'
import asyncio
import os

from volcenginesdkarkruntime import AsyncArk
from volcenginesdkarkruntime.types.responses.response_completed_event import ResponseCompletedEvent
from volcenginesdkarkruntime.types.responses.response_reasoning_summary_text_delta_event import ResponseReasoningSummaryTextDeltaEvent
from volcenginesdkarkruntime.types.responses.response_output_item_added_event import ResponseOutputItemAddedEvent
from volcenginesdkarkruntime.types.responses.response_text_delta_event import ResponseTextDeltaEvent
from volcenginesdkarkruntime.types.responses.response_text_done_event import ResponseTextDoneEvent

# Async Ark client; the API key is read from the ARK_API_KEY environment variable.
client = AsyncArk(
    base_url='https://ark.cn-beijing.volces.com/api/v3',
    api_key=os.getenv('ARK_API_KEY')
)


async def main():
    """Upload a local video via the Files API, then stream a Responses API answer about it."""
    # upload video file
    print("Upload video file")
    # replace with your local video path; the handle is closed once the upload returns
    with open("/Users/doc/demo.mp4", "rb") as video_file:
        file = await client.files.create(
            file=video_file,
            purpose="user_data",
            preprocess_configs={
                "video": {
                    "fps": 0.3,  # define the sampling fps of the video, default is 1.0
                }
            }
        )
    print(f"File uploaded: {file.id}")

    # Wait for the file to finish processing
    await client.files.wait_for_processing(file.id)
    print(f"File processed: {file.id}")

    stream = await client.responses.create(
        model="doubao-seed-1-6-251015",
        input=[
            {"role": "user", "content": [
                {
                    "type": "input_video",
                    "file_id": file.id  # ref video file id
                },
                {
                    "type": "input_text",
                    "text": "请你描述下视频中的人物的一系列动作,以JSON格式输出开始时间(start_time)、结束时间(end_time)、事件(event)、是否危险(danger),请使用HH:mm:ss表示时间戳。"
                }
            ]},
        ],
        caching={
            "type": "enabled",
        },
        store=True,
        stream=True
    )

    # Print reasoning and answer text as it arrives, then the usage summary.
    async for event in stream:
        if isinstance(event, ResponseReasoningSummaryTextDeltaEvent):
            print(event.delta, end="")
        if isinstance(event, ResponseOutputItemAddedEvent):
            print("\noutPutItem " + event.type + " start:")
        if isinstance(event, ResponseTextDeltaEvent):
            print(event.delta, end="")
        if isinstance(event, ResponseTextDoneEvent):
            print("\noutPutTextDone.")
        if isinstance(event, ResponseCompletedEvent):
            print("Response Completed. Usage = " + event.response.usage.model_dump_json())


if __name__ == "__main__":
    asyncio.run(main())
package main

import (
	"context"
	"fmt"
	"io"
	"os"
	"time"

	"github.com/volcengine/volcengine-go-sdk/service/arkruntime"
	"github.com/volcengine/volcengine-go-sdk/service/arkruntime/model/file"
	"github.com/volcengine/volcengine-go-sdk/service/arkruntime/model/responses"
	"github.com/volcengine/volcengine-go-sdk/volcengine"
)

// main uploads a local video through the Files API (sampled at 0.3 fps), polls
// until the file has left the processing state, and then streams a Responses
// API answer about it.
func main() {
	client := arkruntime.NewClientWithApiKey(
		// Get API Key:https://console.volcengine.com/ark/region:ark+cn-beijing/apikey
		os.Getenv("ARK_API_KEY"),
		arkruntime.WithBaseUrl("https://ark.cn-beijing.volces.com/api/v3"),
	)
	ctx := context.Background()

	fmt.Println("----- upload video data -----")
	data, err := os.Open("/Users/doc/demo.mp4")
	if err != nil {
		fmt.Printf("read file error: %v\n", err)
		return
	}
	// Release the local file handle once main returns.
	defer data.Close()

	fileInfo, err := client.UploadFile(ctx, &file.UploadFileRequest{
		File:    data,
		Purpose: file.PurposeUserData,
		PreprocessConfigs: &file.PreprocessConfigs{
			Video: &file.Video{
				Fps: volcengine.Float64(0.3),
			},
		},
	})
	if err != nil {
		fmt.Printf("upload file error: %v\n", err)
		return
	}

	// Wait for the file to finish processing
	for fileInfo.Status == file.StatusProcessing {
		fmt.Println("Waiting for video to be processed...")
		time.Sleep(2 * time.Second)
		fileInfo, err = client.RetrieveFile(ctx, fileInfo.ID) // update file info
		if err != nil {
			fmt.Printf("get file status error: %v\n", err)
			return
		}
	}
	fmt.Printf("Video processing completed: %s, status: %s\n", fileInfo.ID, fileInfo.Status)

	// One user message: the uploaded video (by File ID) followed by the prompt.
	inputMessage := &responses.ItemInputMessage{
		Role: responses.MessageRole_user,
		Content: []*responses.ContentItem{
			{
				Union: &responses.ContentItem_Video{
					Video: &responses.ContentItemVideo{
						Type:   responses.ContentItemType_input_video,
						FileId: volcengine.String(fileInfo.ID),
					},
				},
			},
			{
				Union: &responses.ContentItem_Text{
					Text: &responses.ContentItemText{
						Type: responses.ContentItemType_input_text,
						Text: "请你描述下视频中的人物的一系列动作,以JSON格式输出开始时间(start_time)、结束时间(end_time)、事件(event)、是否危险(danger),请使用HH:mm:ss表示时间戳。",
					},
				},
			},
		},
	}

	createResponsesReq := &responses.ResponsesRequest{
		Model: "doubao-seed-1-6-251015",
		Input: &responses.ResponsesInput{
			Union: &responses.ResponsesInput_ListValue{
				ListValue: &responses.InputItemList{ListValue: []*responses.InputItem{{
					Union: &responses.InputItem_InputMessage{
						InputMessage: inputMessage,
					},
				}}},
			},
		},
		Caching: &responses.ResponsesCaching{Type: responses.CacheType_enabled.Enum()},
	}

	resp, err := client.CreateResponsesStream(ctx, createResponsesReq)
	if err != nil {
		fmt.Printf("stream error: %v\n", err)
		return
	}

	var responseID string
	for {
		event, err := resp.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Printf("stream error: %v\n", err)
			return
		}
		handleEvent(event)
		if responseEvent := event.GetResponse(); responseEvent != nil {
			responseID = responseEvent.GetResponse().GetId()
			fmt.Printf("Response ID: %s\n", responseID)
		}
	}
}

// handleEvent prints reasoning and output text carried by a stream event and
// ignores every other event type.
func handleEvent(event *responses.Event) {
	switch event.GetEventType() {
	case responses.EventType_response_reasoning_summary_text_delta.String():
		// fmt.Print keeps deltas on stdout with the rest of the output; the
		// builtin print writes to stderr.
		fmt.Print(event.GetReasoningText().GetDelta())
	case responses.EventType_response_reasoning_summary_text_done.String(): // aggregated reasoning text
		fmt.Printf("\nAggregated reasoning text: %s\n", event.GetReasoningText().GetText())
	case responses.EventType_response_output_text_delta.String():
		fmt.Print(event.GetText().GetDelta())
	case responses.EventType_response_output_text_done.String(): // aggregated output text
		fmt.Printf("\nAggregated output text: %s\n", event.GetTextDone().GetText())
	default:
		return
	}
}
package com.ark.example;

import com.volcengine.ark.runtime.model.files.FileMeta;
import com.volcengine.ark.runtime.model.files.PreprocessConfigs;
import com.volcengine.ark.runtime.model.files.UploadFileRequest;
import com.volcengine.ark.runtime.model.files.Video;
import com.volcengine.ark.runtime.service.ArkService;
import com.volcengine.ark.runtime.model.responses.request.*;
import com.volcengine.ark.runtime.model.responses.item.ItemEasyMessage;
import com.volcengine.ark.runtime.model.responses.constant.ResponsesConstants;
import com.volcengine.ark.runtime.model.responses.item.MessageContent;
import com.volcengine.ark.runtime.model.responses.content.InputContentItemVideo;
import com.volcengine.ark.runtime.model.responses.content.InputContentItemText;
import com.volcengine.ark.runtime.model.responses.event.functioncall.FunctionCallArgumentsDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.outputitem.OutputItemAddedEvent;
import com.volcengine.ark.runtime.model.responses.event.outputitem.OutputItemDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.outputtext.OutputTextDeltaEvent;
import com.volcengine.ark.runtime.model.responses.event.outputtext.OutputTextDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.reasoningsummary.ReasoningSummaryTextDeltaEvent;
import com.volcengine.ark.runtime.model.responses.event.response.ResponseCompletedEvent;

import java.io.File;
import java.util.concurrent.TimeUnit;

/**
 * Uploads a local video through the Files API (sampled at 0.3 fps), polls until
 * it has left the "processing" state, then streams a Responses API answer about it.
 */
public class demo {
    public static void main(String[] args) {
        String apiKey = System.getenv("ARK_API_KEY");
        ArkService service = ArkService.builder()
                .apiKey(apiKey)
                .baseUrl("https://ark.cn-beijing.volces.com/api/v3")
                .build();

        System.out.println("===== Upload File Example=====");
        // upload a video for responses
        FileMeta fileMeta;
        fileMeta = service.uploadFile(
                UploadFileRequest.builder()
                        .file(new File("/Users/doc/demo.mp4")) // replace with your video file path
                        .purpose("user_data")
                        .preprocessConfigs(PreprocessConfigs.builder().video(new Video(0.3)).build())
                        .build());
        System.out.println("Uploaded file Meta: " + fileMeta);
        System.out.println("status:" + fileMeta.getStatus());

        try {
            // Constant-first equals avoids a NullPointerException if the status is absent.
            while ("processing".equals(fileMeta.getStatus())) {
                System.out.println("Waiting for video to be processed...");
                TimeUnit.SECONDS.sleep(2);
                fileMeta = service.retrieveFile(fileMeta.getId());
            }
        } catch (Exception e) {
            System.err.println("get file status error:" + e.getMessage());
            // The file state is unknown, so stop instead of sending a request that references it.
            service.shutdownExecutor();
            return;
        }
        System.out.println("Uploaded file Meta: " + fileMeta);

        // One user message: the uploaded video (by File ID) followed by the prompt.
        CreateResponsesRequest request = CreateResponsesRequest.builder()
                .model("doubao-seed-1-6-251015")
                .stream(true)
                .input(ResponsesInput.builder().addListItem(
                        ItemEasyMessage.builder().role(ResponsesConstants.MESSAGE_ROLE_USER).content(
                                MessageContent.builder()
                                        .addListItem(InputContentItemVideo.builder().fileId(fileMeta.getId()).build())
                                        .addListItem(InputContentItemText.builder().text("请你描述下视频中的人物的一系列动作,以JSON格式输出开始时间(start_time)、结束时间(end_time)、事件(event)、是否危险(danger),请使用HH:mm:ss表示时间戳。").build())
                                        .build()
                        ).build()
                ).build())
                .build();

        // Print each stream event as it arrives; blockingForEach returns when the stream ends.
        service.streamResponse(request)
                .doOnError(Throwable::printStackTrace)
                .blockingForEach(event -> {
                    if (event instanceof ReasoningSummaryTextDeltaEvent) {
                        System.out.print(((ReasoningSummaryTextDeltaEvent) event).getDelta());
                    }
                    if (event instanceof OutputItemAddedEvent) {
                        System.out.println("\nOutputItem " + (((OutputItemAddedEvent) event).getItem().getType()) + " Start: ");
                    }
                    if (event instanceof OutputTextDeltaEvent) {
                        System.out.print(((OutputTextDeltaEvent) event).getDelta());
                    }
                    if (event instanceof OutputTextDoneEvent) {
                        System.out.println("\nOutputText End.");
                    }
                    if (event instanceof OutputItemDoneEvent) {
                        System.out.println("\nOutputItem " + ((OutputItemDoneEvent) event).getItem().getType() + " End.");
                    }
                    if (event instanceof FunctionCallArgumentsDoneEvent) {
                        System.out.println("\nFunctionCall Arguments: " + ((FunctionCallArgumentsDoneEvent) event).getArguments());
                    }
                    if (event instanceof ResponseCompletedEvent) {
                        System.out.println("\nResponse Completed. Usage = " + ((ResponseCompletedEvent) event).getResponse().getUsage());
                    }
                });

        service.shutdownExecutor();
    }
}
import os
import time

from openai import OpenAI

# Uploads a local video through the Files API with the OpenAI-compatible SDK,
# waits until it has been processed, then streams a Responses API answer about it.
api_key = os.getenv('ARK_API_KEY')

client = OpenAI(
    base_url='https://ark.cn-beijing.volces.com/api/v3',
    api_key=api_key,
)

# NOTE(review): no preprocess_configs is sent here, so the video is presumably
# sampled at the service default fps — confirm this is intended.
# replace with your local video path; the handle is closed once the upload returns
with open("/Users/doc/demo.mp4", "rb") as video_file:
    file = client.files.create(
        file=video_file,
        purpose="user_data"
    )

# Wait for the file to finish processing
while file.status == "processing":
    time.sleep(2)
    file = client.files.retrieve(file.id)
print(f"File processed: {file}")

response = client.responses.create(
    model="doubao-seed-1-6-251015",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_video",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": "请你描述下视频中的人物的一系列动作,以JSON格式输出开始时间(start_time)、结束时间(end_time)、事件(event)、是否危险(danger),请使用HH:mm:ss表示时间戳。",
                },
            ]
        }
    ],
    stream=True
)

for event in response:
    if event.type == "response.reasoning_summary_text.delta":
        print(event.delta, end="")
    if event.type == "response.output_item.added":
        print("\noutPutItem " + event.type + " start:")
    if event.type == "response.output_text.delta":
        print(event.delta, end="")
    # The "text done" message belongs to the output_text.done event, not output_item.done.
    if event.type == "response.output_text.done":
        print("\noutPutTextDone.")
    if event.type == "response.completed":
        print("\nResponse Completed. Usage = " + event.response.usage.model_dump_json())
输出示例:
{ "text": [ { "start_time": "00:00:00", "end_time": "00:00:03", "event": "将黄色积木堆叠到红黄绿相间的积木塔顶端", "danger": false }, { "start_time": "00:00:05", "end_time": "00:00:07", "event": "拿起蓝色积木放入口中啃咬后取出", "danger": false }, { "start_time": "00:00:08", "end_time": "00:00:12", "event": "用手推动黄色和蓝色的玩具卡车在地面移动", "danger": false }, { "start_time": "00:00:12", "end_time": "00:00:24", "event": "双手扶着木质抽屉柜边缘,尝试将右脚踩在下方抽屉把手上向上攀爬", "danger": true } ] }
模型支持处理 PDF 格式的文档:预处理时会将文档按页拆分为多张图片,再将每页图像输入模型进行处理,以实现对文档内容的理解和分析。关于文档理解支持的文件输入方式及示例,请参见文档理解。
建议优先选择 Files API 上传的方式,适用于文件较大,或者需要在多个请求中重复使用文件的场景。
该方式支持最大 512MB 的文件,默认存储 7 天,存储有效期取值范围为1-30天。
示例中用到的PDF文件及提示词如下:
| PDF文件 | 提示词 |
|---|---|
| (示例PDF文档) | 按段落给出文档中的文字内容,以JSON格式输出,包括段落类型(type)、文字内容(content)信息。 |
代码示例:
上传PDF文件获取File ID。
# Upload a local PDF through the Files API; the response carries the File ID.
curl https://ark.cn-beijing.volces.com/api/v3/files \
  -H "Authorization: Bearer $ARK_API_KEY" \
  -F "purpose=user_data" \
  -F "file=@/Users/doc/demo.pdf"
在Responses API中引用File ID。
# Reference the uploaded PDF by its File ID in a Responses API request.
curl https://ark.cn-beijing.volces.com/api/v3/responses \
  -H "Authorization: Bearer $ARK_API_KEY" \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "doubao-seed-1-6-251015",
    "input": [
      {
        "role": "user",
        "content": [
          {
            "type": "input_file",
            "file_id": "file-20251018****"
          },
          {
            "type": "input_text",
            "text": "按段落给出文档中的文字内容,以JSON格式输出,包括段落类型(type)、文字内容(content)信息。"
          }
        ]
      }
    ]
  }'
import asyncio
import os

from volcenginesdkarkruntime import AsyncArk
from volcenginesdkarkruntime.types.responses.response_completed_event import ResponseCompletedEvent
from volcenginesdkarkruntime.types.responses.response_reasoning_summary_text_delta_event import ResponseReasoningSummaryTextDeltaEvent
from volcenginesdkarkruntime.types.responses.response_output_item_added_event import ResponseOutputItemAddedEvent
from volcenginesdkarkruntime.types.responses.response_text_delta_event import ResponseTextDeltaEvent
from volcenginesdkarkruntime.types.responses.response_text_done_event import ResponseTextDoneEvent

# Async Ark client; the API key is read from the ARK_API_KEY environment variable.
client = AsyncArk(
    base_url='https://ark.cn-beijing.volces.com/api/v3',
    api_key=os.getenv('ARK_API_KEY')
)


async def main():
    """Upload a local PDF via the Files API, then stream a Responses API answer about it."""
    # upload pdf file
    print("Upload pdf file")
    # replace with your local pdf path; the handle is closed once the upload returns
    with open("/Users/doc/demo.pdf", "rb") as pdf_file:
        file = await client.files.create(
            file=pdf_file,
            purpose="user_data"
        )
    print(f"File uploaded: {file.id}")

    # Wait for the file to finish processing
    await client.files.wait_for_processing(file.id)
    print(f"File processed: {file.id}")

    stream = await client.responses.create(
        model="doubao-seed-1-6-251015",
        input=[
            {"role": "user", "content": [
                {
                    "type": "input_file",
                    "file_id": file.id  # ref pdf file id
                },
                {
                    "type": "input_text",
                    "text": "按段落给出文档中的文字内容,以JSON格式输出,包括段落类型(type)、文字内容(content)信息。"
                }
            ]},
        ],
        caching={
            "type": "enabled",
        },
        store=True,
        stream=True
    )

    # Print reasoning and answer text as it arrives, then the usage summary.
    async for event in stream:
        if isinstance(event, ResponseReasoningSummaryTextDeltaEvent):
            print(event.delta, end="")
        if isinstance(event, ResponseOutputItemAddedEvent):
            print("\noutPutItem " + event.type + " start:")
        if isinstance(event, ResponseTextDeltaEvent):
            print(event.delta, end="")
        if isinstance(event, ResponseTextDoneEvent):
            print("\noutPutTextDone.")
        if isinstance(event, ResponseCompletedEvent):
            print("Response Completed. Usage = " + event.response.usage.model_dump_json())


if __name__ == "__main__":
    asyncio.run(main())
package main

import (
	"context"
	"fmt"
	"io"
	"os"
	"time"

	"github.com/volcengine/volcengine-go-sdk/service/arkruntime"
	"github.com/volcengine/volcengine-go-sdk/service/arkruntime/model/file"
	"github.com/volcengine/volcengine-go-sdk/service/arkruntime/model/responses"
	"github.com/volcengine/volcengine-go-sdk/volcengine"
)

// main uploads a local PDF through the Files API, polls until the file has
// left the processing state, and then streams a Responses API answer about it.
func main() {
	client := arkruntime.NewClientWithApiKey(os.Getenv("ARK_API_KEY"))
	ctx := context.Background()

	fmt.Println("----- upload file data -----")
	data, err := os.Open("/Users/doc/demo.pdf")
	if err != nil {
		fmt.Printf("read file error: %v\n", err)
		return
	}
	// Release the local file handle once main returns.
	defer data.Close()

	fileInfo, err := client.UploadFile(ctx, &file.UploadFileRequest{
		File:    data,
		Purpose: file.PurposeUserData,
	})
	if err != nil {
		fmt.Printf("upload file error: %v\n", err)
		return
	}

	// Wait for the file to finish processing
	for fileInfo.Status == file.StatusProcessing {
		fmt.Println("Waiting for file to be processed...")
		time.Sleep(2 * time.Second)
		fileInfo, err = client.RetrieveFile(ctx, fileInfo.ID) // update file info
		if err != nil {
			fmt.Printf("get file status error: %v\n", err)
			return
		}
	}
	fmt.Printf("File processing completed: %s, status: %s\n", fileInfo.ID, fileInfo.Status)

	// One user message: the uploaded PDF (by File ID) followed by the prompt.
	inputMessage := &responses.ItemInputMessage{
		Role: responses.MessageRole_user,
		Content: []*responses.ContentItem{
			{
				Union: &responses.ContentItem_File{
					File: &responses.ContentItemFile{
						Type:   responses.ContentItemType_input_file,
						FileId: volcengine.String(fileInfo.ID),
					},
				},
			},
			{
				Union: &responses.ContentItem_Text{
					Text: &responses.ContentItemText{
						Type: responses.ContentItemType_input_text,
						Text: "按段落给出文档中的文字内容,以JSON格式输出,包括段落类型(type)、文字内容(content)信息。",
					},
				},
			},
		},
	}

	createResponsesReq := &responses.ResponsesRequest{
		Model: "doubao-seed-1-6-251015",
		Input: &responses.ResponsesInput{
			Union: &responses.ResponsesInput_ListValue{
				ListValue: &responses.InputItemList{ListValue: []*responses.InputItem{{
					Union: &responses.InputItem_InputMessage{
						InputMessage: inputMessage,
					},
				}}},
			},
		},
		Caching: &responses.ResponsesCaching{Type: responses.CacheType_enabled.Enum()},
	}

	resp, err := client.CreateResponsesStream(ctx, createResponsesReq)
	if err != nil {
		fmt.Printf("stream error: %v\n", err)
		return
	}

	var responseID string
	for {
		event, err := resp.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Printf("stream error: %v\n", err)
			return
		}
		handleEvent(event)
		if responseEvent := event.GetResponse(); responseEvent != nil {
			responseID = responseEvent.GetResponse().GetId()
			fmt.Printf("Response ID: %s\n", responseID)
		}
	}
}

// handleEvent prints reasoning and output text carried by a stream event and
// ignores every other event type.
func handleEvent(event *responses.Event) {
	switch event.GetEventType() {
	case responses.EventType_response_reasoning_summary_text_delta.String():
		// fmt.Print keeps deltas on stdout with the rest of the output; the
		// builtin print writes to stderr.
		fmt.Print(event.GetReasoningText().GetDelta())
	case responses.EventType_response_reasoning_summary_text_done.String(): // aggregated reasoning text
		fmt.Printf("\nAggregated reasoning text: %s\n", event.GetReasoningText().GetText())
	case responses.EventType_response_output_text_delta.String():
		fmt.Print(event.GetText().GetDelta())
	case responses.EventType_response_output_text_done.String(): // aggregated output text
		fmt.Printf("\nAggregated output text: %s\n", event.GetTextDone().GetText())
	default:
		return
	}
}
package com.ark.example;

import com.volcengine.ark.runtime.model.files.FileMeta;
import com.volcengine.ark.runtime.model.files.UploadFileRequest;
import com.volcengine.ark.runtime.model.responses.content.InputContentItemFile;
import com.volcengine.ark.runtime.service.ArkService;
import com.volcengine.ark.runtime.model.responses.request.*;
import com.volcengine.ark.runtime.model.responses.item.ItemEasyMessage;
import com.volcengine.ark.runtime.model.responses.constant.ResponsesConstants;
import com.volcengine.ark.runtime.model.responses.item.MessageContent;
import com.volcengine.ark.runtime.model.responses.content.InputContentItemText;
import com.volcengine.ark.runtime.model.responses.event.functioncall.FunctionCallArgumentsDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.outputitem.OutputItemAddedEvent;
import com.volcengine.ark.runtime.model.responses.event.outputitem.OutputItemDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.outputtext.OutputTextDeltaEvent;
import com.volcengine.ark.runtime.model.responses.event.outputtext.OutputTextDoneEvent;
import com.volcengine.ark.runtime.model.responses.event.reasoningsummary.ReasoningSummaryTextDeltaEvent;
import com.volcengine.ark.runtime.model.responses.event.response.ResponseCompletedEvent;

import java.io.File;
import java.util.concurrent.TimeUnit;

/**
 * Uploads a local PDF through the Files API, polls until it has left the
 * "processing" state, then streams a Responses API answer about the document.
 */
public class demo {
    public static void main(String[] args) {
        String apiKey = System.getenv("ARK_API_KEY");
        ArkService service = ArkService.builder().apiKey(apiKey).build();

        System.out.println("===== Upload File Example=====");
        // upload a file for responses
        FileMeta fileMeta;
        fileMeta = service.uploadFile(
                UploadFileRequest.builder()
                        .file(new File("/Users/doc/demo.pdf")) // replace with your file path
                        .purpose("user_data")
                        .build());
        System.out.println("Uploaded file Meta: " + fileMeta);
        System.out.println("status:" + fileMeta.getStatus());

        try {
            // Constant-first equals avoids a NullPointerException if the status is absent.
            while ("processing".equals(fileMeta.getStatus())) {
                System.out.println("Waiting for file to be processed...");
                TimeUnit.SECONDS.sleep(2);
                fileMeta = service.retrieveFile(fileMeta.getId());
            }
        } catch (Exception e) {
            System.err.println("get file status error:" + e.getMessage());
            // The file state is unknown, so stop instead of sending a request that references it.
            service.shutdownExecutor();
            return;
        }
        System.out.println("Uploaded file Meta: " + fileMeta);

        // One user message: the uploaded PDF (by File ID) followed by the prompt.
        CreateResponsesRequest request = CreateResponsesRequest.builder()
                .model("doubao-seed-1-6-251015")
                .stream(true)
                .input(ResponsesInput.builder().addListItem(
                        ItemEasyMessage.builder().role(ResponsesConstants.MESSAGE_ROLE_USER).content(
                                MessageContent.builder()
                                        .addListItem(InputContentItemFile.InputContentItemFileBuilder.anInputContentItemFile().fileId(fileMeta.getId()).build())
                                        .addListItem(InputContentItemText.builder().text("按段落给出文档中的文字内容,以JSON格式输出,包括段落类型(type)、文字内容(content)信息。").build())
                                        .build()
                        ).build()
                ).build())
                .build();

        // Print each stream event as it arrives; blockingForEach returns when the stream ends.
        service.streamResponse(request)
                .doOnError(Throwable::printStackTrace)
                .blockingForEach(event -> {
                    if (event instanceof ReasoningSummaryTextDeltaEvent) {
                        System.out.print(((ReasoningSummaryTextDeltaEvent) event).getDelta());
                    }
                    if (event instanceof OutputItemAddedEvent) {
                        System.out.println("\nOutputItem " + (((OutputItemAddedEvent) event).getItem().getType()) + " Start: ");
                    }
                    if (event instanceof OutputTextDeltaEvent) {
                        System.out.print(((OutputTextDeltaEvent) event).getDelta());
                    }
                    if (event instanceof OutputTextDoneEvent) {
                        System.out.println("\nOutputText End.");
                    }
                    if (event instanceof OutputItemDoneEvent) {
                        System.out.println("\nOutputItem " + ((OutputItemDoneEvent) event).getItem().getType() + " End.");
                    }
                    if (event instanceof FunctionCallArgumentsDoneEvent) {
                        System.out.println("\nFunctionCall Arguments: " + ((FunctionCallArgumentsDoneEvent) event).getArguments());
                    }
                    if (event instanceof ResponseCompletedEvent) {
                        System.out.println("\nResponse Completed. Usage = " + ((ResponseCompletedEvent) event).getResponse().getUsage());
                    }
                });

        service.shutdownExecutor();
    }
}
# Document understanding through the Files API with the OpenAI SDK:
#   1. upload a local PDF and obtain its file id,
#   2. poll until the file leaves the "processing" state,
#   3. stream a Responses API call that references the file id.
import os
import time

from openai import OpenAI

api_key = os.getenv('ARK_API_KEY')
client = OpenAI(
    base_url='https://ark.cn-beijing.volces.com/api/v3',
    api_key=api_key,
)

# Upload inside a context manager so the local file handle is closed as soon
# as the upload request returns (the handle was previously never closed).
with open("/Users/doc/demo.pdf", "rb") as pdf_file:
    file = client.files.create(
        file=pdf_file,
        purpose="user_data"
    )

# Wait for the file to finish processing.
# NOTE(review): any status other than "processing" ends the loop; confirm
# whether a failure status should abort before the request below is sent.
while file.status == "processing":
    time.sleep(2)
    file = client.files.retrieve(file.id)
print(f"File processed: {file}")

response = client.responses.create(
    model="doubao-seed-1-6-251015",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": "按段落给出文档中的文字内容,以JSON格式输出,包括段落类型(type)、文字内容(content)信息。",
                },
            ]
        }
    ],
    stream=True
)

# Each streamed event carries exactly one type, so the branches are exclusive.
for event in response:
    if event.type == "response.reasoning_summary_text.delta":
        # Incremental reasoning-summary text.
        print(event.delta, end="")
    elif event.type == "response.output_item.added":
        print(" outPutItem " + event.type + " start:")
    elif event.type == "response.output_text.delta":
        # Incremental answer text.
        print(event.delta, end="")
    elif event.type == "response.output_text.done":
        # Fixed: the text-done message was previously bound to
        # "response.output_item.done", so it fired once per output item
        # instead of when the output text finished.
        print(" outPutTextDone.")
    elif event.type == "response.completed":
        print(" Response Completed. Usage = " + event.response.usage.model_dump_json())
输出示例:
{ "text": [ { "type": "heading", "content": "1 Introduction" }, { "type": "paragraph", "content": "Diffusion models [3–5] learn to reverse a process that incrementally corrupts data with noise, effectively decomposing a complex distribution into a hierarchy of simplified representations. This coarse-to-fine generative approach has proven remarkably successful across a wide range of applications, including image and video synthesis [6] as well as solving complex challenges in natural sciences [7]." }, ... { "type": "heading", "content": "3 Seed Diffusion" }, { "type": "paragraph", "content": "As the first experimental model in our Seed Diffusion series, Seed Diffusion Preview is specifically focused on code generation, thus adopting the data pipeline (code/code-related data only) and processing methodology of the open-sourced Seed Coder project [20]. The architecture is a standard dense Transformer, and we intentionally omit complex components such as LongCoT reasoning in this initial version to first establish a strong and efficient performance baseline. This section introduces its key components and training strategies." } ] }