Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for Azure speech #11375

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
23 changes: 23 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/azurespeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Any

from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin.azurespeech.tools.tts import AzureTTSTool
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController


class AzureSpeechProvider(BuiltinToolProviderController):
    """Builtin tool provider for the Azure Speech tools."""

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """Check the credentials by running one short text-to-speech request.

        Args:
            credentials: Provider credentials; handed unchanged to the TTS
                tool runtime under the ``"credentials"`` key.

        Raises:
            ToolProviderCredentialValidationError: If the probe invocation
                raises for any reason. The original exception is attached as
                ``__cause__`` so the root failure stays visible in tracebacks.
        """
        try:
            AzureTTSTool().fork_tool_runtime(
                runtime={
                    "credentials": credentials,
                }
            ).invoke(
                user_id="",
                tool_parameters={
                    "text": "This is a test text",
                    "speech_synthesis_voice_name": "en-US-AvaMultilingualNeural",
                },
            )
        except Exception as e:
            # Chain explicitly: this is a deliberate translation of the error,
            # not a second failure that happened while handling the first.
            raise ToolProviderCredentialValidationError(str(e)) from e
51 changes: 51 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/azurespeech.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Provider manifest for the Azure Speech builtin tools.
identity:
  author: ymshenyu
  name: azurespeech
  label:
    en_US: Azure Speech
    zh_Hans: Azure Speech
  description:
    en_US: Azure Speech
    zh_Hans: Azure Speech
  icon: icon.png
  tags:
    - utilities
# Credentials collected for this provider. tools/tts.py and tools/asr.py read
# them from `self.runtime.credentials` using the key names below.
credentials_for_provider:
  # Passed as the SDK `subscription` in tts.py and sent as the
  # `Ocp-Apim-Subscription-Key` request header in asr.py.
  azure_speech_api_key:
    type: secret-input
    required: true
    label:
      en_US: API key
      zh_Hans: 密钥
    help:
      en_US: Please input your Azure Speech API key
      zh_Hans: 请输入你的 Azure Speech API key
    placeholder:
      en_US: Please input your Azure Speech API key
      zh_Hans: 请输入你的 Azure Speech API key
  # Passed as the SDK `region` in tts.py and used as the host-name prefix of
  # the REST endpoint URL in asr.py.
  azure_speech_region:
    type: text-input
    required: true
    label:
      en_US: API Region
      zh_Hans: API 地区
    help:
      en_US: Please input your Azure Speech Region, e.g. eastasia
      zh_Hans: 请输入你的 Azure Speech 地区,例如:eastasia
    placeholder:
      en_US: Please input your Azure Speech Region, e.g. eastasia
      zh_Hans: 请输入你的 Azure Speech 地区,例如:eastasia
  # Read by asr.py as the `api-version` query parameter; tts.py does not read it.
  azure_speech_api_version:
    type: text-input
    required: false
    default: '2024-11-15'
    label:
      en_US: API Version
      zh_Hans: API Version
      pt_BR: Versão da API
    placeholder:
      en_US: Please input your Azure Speech API Version
      zh_Hans: 请输入你的 Azure Speech API Version
    help:
      en_US: Get your API Version from Azure Speech
      zh_Hans: 从 Azure Speech 获取您的 API Version
48 changes: 48 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/tools/asr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import json
from typing import Any

import requests

from core.file.enums import FileType
from core.file.file_manager import download
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool


class AzureASRTool(BuiltinTool):
    """Builtin tool that transcribes an audio file through the Azure Speech REST endpoint."""

    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInvokeMessage]:
        """Upload the audio file for transcription and return the recognized text.

        Args:
            user_id: Not used by this method.
            tool_parameters: Reads ``audio_file`` (a file object exposing
                ``.type``) and the optional ``definition`` string, which is
                forwarded as an ``application/json`` multipart field.

        Returns:
            A text message with the ``text`` of the first ``combinedPhrases``
            entry followed by a JSON message with the full response body, or a
            single ``"not a valid audio file"`` text message when the file is
            missing or is not an audio file.

        Raises:
            Exception: If the response body is not JSON, or if it contains no
                ``combinedPhrases`` entries.
        """
        # Fallback for the optional credential; same value as the default
        # declared for azure_speech_api_version in azurespeech.yaml.
        default_api_version = "2024-11-15"
        # (connect, read) seconds. Without a timeout `requests` waits forever
        # on a stalled connection; the read bound is generous because the
        # request only returns once transcription has finished.
        request_timeout = (10, 600)

        file = tool_parameters.get("audio_file")
        # A missing file would otherwise fail with AttributeError on `None.type`.
        if file is None or file.type != FileType.AUDIO:
            return [self.create_text_message("not a valid audio file")]
        audio_binary = download(file)
        definition: str = tool_parameters.get("definition") or ""

        files: dict = {"audio": audio_binary}
        if definition:
            # Tuple form is (filename, content, content_type); filename=None
            # makes this a plain form field rather than a file upload.
            files["definition"] = (None, definition, "application/json")

        credentials = self.runtime.credentials
        # The credential is optional, so guard against "api-version=None" in the URL.
        api_version = credentials.get("azure_speech_api_version") or default_api_version
        resp = requests.post(
            "https://{}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version={}".format(
                credentials.get("azure_speech_region"),
                api_version,
            ),
            headers={
                "Ocp-Apim-Subscription-Key": credentials.get("azure_speech_api_key"),
            },
            files=files,
            timeout=request_timeout,
        )

        try:
            data: dict = resp.json()
        except ValueError as e:
            # requests' JSON decode errors subclass ValueError; report the
            # status and a body excerpt instead of an opaque decode error.
            raise Exception(
                "Azure Speech returned a non-JSON response (HTTP {}): {}".format(resp.status_code, resp.text[:500])
            ) from e

        combined_phrases = data.get("combinedPhrases", [])

        if not combined_phrases:
            # Error payloads carry no combinedPhrases; include the whole body
            # so the caller can see what the service reported.
            raise Exception(
                "No text detected.\nError: {}\nDefinition: {}".format(json.dumps(data), definition)
            )

        return [
            self.create_text_message(combined_phrases[0].get("text", "")),
            self.create_json_message(data),
        ]
33 changes: 33 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/tools/asr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Tool manifest for the speech-to-text tool (presumably paired with asr.py in
# the same directory — confirm against the tool loader).
identity:
  name: azure_asr
  author: ymshenyu
  label:
    en_US: Azure Speech To Text
description:
  human:
    en_US: Convert audio file to text.
    zh_Hans: 将音频文件转换为文本。
  llm: Convert audio file to text.
parameters:
  # asr.py answers "not a valid audio file" when this file's type is not audio.
  - name: audio_file
    type: file
    required: true
    label:
      en_US: Audio File
      zh_Hans: 音频文件
    human_description:
      en_US: The audio file to be converted.
      zh_Hans: 要转换的音频文件。
    llm_description: The audio file to be converted.
    form: llm
  # Optional. When non-empty, asr.py sends this string as an
  # application/json multipart field named "definition".
  - name: definition
    type: string
    required: false
    label:
      en_US: Definition
      zh_Hans: asr 详细配置
    human_description:
      en_US: Definition
      zh_Hans: asr 详细配置
    llm_description: Definition
    form: form
55 changes: 55 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/tools/tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import io
from typing import Any

import azure.cognitiveservices.speech as speechsdk

from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool


class AzureTTSTool(BuiltinTool):
    """Builtin tool that turns text into MP3 audio through the Azure Speech SDK."""

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> list[ToolInvokeMessage]:
        """Synthesize the given text and return the audio as a blob message.

        Args:
            user_id: Not used by this method.
            tool_parameters: Reads ``text`` (defaults to ``""``) and
                ``speech_synthesis_voice_name`` (defaults to
                ``"en-US-AvaMultilingualNeural"``).

        Returns:
            On success, a text message followed by a blob message holding the
            synthesized audio bytes. A single ``"Audio generation failed"``
            text message when the result is neither completed nor canceled.

        Raises:
            Exception: If synthesis was canceled; the message carries the
                cancellation reason and, for error cancellations, the error
                details when present.
        """
        credentials = self.runtime.credentials
        config = speechsdk.SpeechConfig(
            subscription=credentials["azure_speech_api_key"],
            region=credentials["azure_speech_region"],
        )
        voice = tool_parameters.get("speech_synthesis_voice_name", "en-US-AvaMultilingualNeural")
        config.speech_synthesis_voice_name = voice
        config.set_speech_synthesis_output_format(
            speechsdk.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3
        )

        synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=None)
        result = synthesizer.speak_text(text=tool_parameters.get("text", ""))

        outcome = result.reason

        # Cancellation is the only outcome reported by raising.
        if outcome == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            message = "Speech synthesis canceled: {}\n".format(details.reason)
            if details.reason == speechsdk.CancellationReason.Error and details.error_details:
                message += "Error details: {}".format(details.error_details)
            raise Exception(message)

        # Any other non-success reason is reported as a plain text message.
        if outcome != speechsdk.ResultReason.SynthesizingAudioCompleted:
            return [self.create_text_message("Audio generation failed")]

        # Drain the result stream in 1 KiB reads until it reports no more data.
        stream = speechsdk.AudioDataStream(result)
        collected = io.BytesIO()
        chunk = bytes(1024)
        while (filled := stream.read_data(chunk)) > 0:
            collected.write(chunk[:filled])

        return [
            self.create_text_message("Audio generated successfully"),
            self.create_blob_message(
                blob=collected.getvalue(),
                meta={"mime_type": "audio/mpeg"},
                save_as=self.VariableKey.AUDIO,
            ),
        ]
33 changes: 33 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/tools/tts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Tool manifest for the text-to-speech tool (presumably paired with tts.py in
# the same directory — confirm against the tool loader).
identity:
  name: azure_tts
  author: ymshenyu
  label:
    en_US: Azure Text To Speech
description:
  human:
    en_US: Convert text to audio file.
    zh_Hans: 将文本转换为音频文件。
  llm: Convert text to audio file.
parameters:
  # tts.py falls back to an empty string when this key is absent.
  - name: text
    type: string
    required: true
    label:
      en_US: Text
      zh_Hans: 文本
    human_description:
      en_US: The text to be converted.
      zh_Hans: 要转换的文本。
    llm_description: The text to be converted.
    form: llm
  # tts.py falls back to "en-US-AvaMultilingualNeural" when this key is absent.
  - name: speech_synthesis_voice_name
    type: string
    required: true
    label:
      en_US: Voice Name
      zh_Hans: 音色
    human_description:
      en_US: The voice name to be used.
      zh_Hans: 要使用的音色
    llm_description: The voice name to be used.
    form: form
Loading