diff --git a/api/core/tools/provider/builtin/azurespeech/tools/asr.py b/api/core/tools/provider/builtin/azurespeech/tools/asr.py index abd03491829ded..9b9037680b6545 100644 --- a/api/core/tools/provider/builtin/azurespeech/tools/asr.py +++ b/api/core/tools/provider/builtin/azurespeech/tools/asr.py @@ -15,7 +15,11 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInv if file.type != FileType.AUDIO: return [self.create_text_message("not a valid audio file")] audio_binary = download(file) + definition: str = tool_parameters.get("definition", "") + files: dict = {"audio": audio_binary} + if definition: + files["definition"] = definition resp = requests.post( "https://{}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version={}".format( self.runtime.credentials.get("azure_speech_region"), @@ -24,7 +28,7 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInv headers={ "Ocp-Apim-Subscription-Key": self.runtime.credentials.get("azure_speech_api_key"), }, - files={"audio": audio_binary}, + files=files, ) data: dict = resp.json() @@ -34,4 +38,7 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInv if len(combinedPhrases) == 0: raise Exception("No text detected, error: {}".format(json.dumps(data))) - return [self.create_text_message(data.get("combinedPhrases", [])[0].get("text", ""))] + return [ + self.create_text_message(data.get("combinedPhrases", [])[0].get("text", "")), + self.create_json_message(data), + ] diff --git a/api/core/tools/provider/builtin/azurespeech/tools/asr.yaml b/api/core/tools/provider/builtin/azurespeech/tools/asr.yaml index 13c05a4b933b9c..5829b47b37af8c 100644 --- a/api/core/tools/provider/builtin/azurespeech/tools/asr.yaml +++ b/api/core/tools/provider/builtin/azurespeech/tools/asr.yaml @@ -20,3 +20,14 @@ parameters: zh_Hans: 要转换的音频文件。 llm_description: The audio file to be converted. form: llm + - name: definition + type: string + required: false + label: + en_US: Definition + zh_Hans: asr 详细配置 + human_description: + en_US: Definition + zh_Hans: asr 详细配置 + llm_description: Definition + form: form