Skip to content

Commit

Permalink
feat: add azure speech support
Browse files Browse the repository at this point in the history
  • Loading branch information
ymshenyu committed Dec 5, 2024
1 parent 376726c commit 28f7f6f
Show file tree
Hide file tree
Showing 10 changed files with 11,290 additions and 11,059 deletions.
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 8 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/azurespeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from typing import Any

from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController


class AzureSpeechProvider(BuiltinToolProviderController):
    """Built-in tool provider for the Azure Speech tools."""

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """Accept the supplied credentials without inspecting them.

        No field is read and no request is made here, so every credentials
        dictionary passes.

        Args:
            credentials: Provider credentials; not used.
        """
        return None
51 changes: 51 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/azurespeech.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
identity:
author: ymshenyu
name: azurespeech
label:
en_US: Azure Speech
zh_Hans: Azure Speech
description:
en_US: Azure Speech
zh_Hans: Azure Speech
icon: icon.png
tags:
- utilities
credentials_for_provider:
azure_speech_api_key:
type: secret-input
required: true
label:
en_US: API key
zh_Hans: 密钥
help:
en_US: Please input your Azure Speech API key
zh_Hans: 请输入你的 Azure Speech API key
placeholder:
en_US: Please input your Azure Speech API key
zh_Hans: 请输入你的 Azure Speech API key
azure_speech_region:
type: text-input
required: true
label:
en_US: API Region
zh_Hans: API 地区
help:
en_US: Please input your Azure Speech Region, e.g. eastasia
zh_Hans: 请输入你的 Azure Speech 地区,例如:eastasia
placeholder:
en_US: Please input your Azure Speech Region, e.g. eastasia
zh_Hans: 请输入你的 Azure Speech 地区,例如:eastasia
azure_speech_api_version:
type: text-input
required: false
default: '2024-11-15'
label:
en_US: API Version
zh_Hans: API Version
pt_BR: Versão da API
placeholder:
en_US: Please input your Azure Speech API Version
zh_Hans: 请输入你的 Azure Speech API Version
help:
en_US: Get your API Version from Azure Speech
zh_Hans: 从 Azure Speech 获取您的 API Version
31 changes: 31 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/tools/asr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import Any

import requests

from core.file.enums import FileType
from core.file.file_manager import download
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool


class AzureASRTool(BuiltinTool):
    """Tool that posts an audio file to the Azure Speech transcription endpoint."""

    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInvokeMessage]:
        """Transcribe the ``audio_file`` tool parameter and return its text.

        Args:
            user_id: Identifier of the invoking user; not used here.
            tool_parameters: Tool arguments; ``audio_file`` holds the file to
                transcribe.

        Returns:
            A single text message containing the first combined phrase of the
            transcription (empty when the service returned no phrases), or a
            message explaining why no transcription was produced.
        """
        file = tool_parameters.get("audio_file")
        # A missing parameter yields None, which has no ``type`` attribute.
        if file is None or file.type != FileType.AUDIO:
            return [self.create_text_message("not a valid audio file")]
        audio_binary = download(file)

        credentials = self.runtime.credentials
        # The API version credential is optional; fall back to the default
        # declared in the provider configuration so the URL never carries
        # "api-version=None".
        api_version = credentials.get("azure_speech_api_version") or "2024-11-15"

        resp = requests.post(
            "https://{}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version={}".format(
                credentials.get("azure_speech_region"),
                api_version,
            ),
            headers={
                "Ocp-Apim-Subscription-Key": credentials.get("azure_speech_api_key"),
            },
            files={"audio": audio_binary},
            # (connect, read) seconds; without a timeout the call can hang forever.
            timeout=(10, 300),
        )

        # Report HTTP failures (bad key, wrong region, ...) as a message
        # instead of failing later while indexing a missing result list.
        if not resp.ok:
            return [
                self.create_text_message(
                    "Speech transcription failed with status {}: {}".format(resp.status_code, resp.text)
                )
            ]

        data: dict = resp.json()

        # "combinedPhrases" may be absent or empty, e.g. for silent audio.
        phrases = data.get("combinedPhrases") or []
        text = phrases[0].get("text", "") if phrases else ""
        return [self.create_text_message(text)]
22 changes: 22 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/tools/asr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
identity:
name: azure_asr
author: ymshenyu
label:
en_US: Azure Speech To Text
description:
human:
en_US: Convert audio file to text.
zh_Hans: 将音频文件转换为文本。
llm: Convert audio file to text.
parameters:
- name: audio_file
type: file
required: true
label:
en_US: Audio File
zh_Hans: 音频文件
human_description:
en_US: The audio file to be converted.
zh_Hans: 要转换的音频文件。
llm_description: The audio file to be converted.
form: llm
55 changes: 55 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/tools/tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import io
from typing import Any

import azure.cognitiveservices.speech as speechsdk

from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool


class AzureTTSTool(BuiltinTool):
    """Tool that synthesizes speech from text through the Azure Speech SDK."""

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> list[ToolInvokeMessage]:
        """Synthesize the ``text`` tool parameter into MP3 audio.

        Args:
            user_id: Identifier of the invoking user; not used here.
            tool_parameters: Tool arguments; reads ``text`` and
                ``speech_synthesis_voice_name``.

        Returns:
            On success, a text message followed by a blob message carrying
            the audio bytes. Otherwise a single text message describing the
            cancellation or a generic failure.
        """
        config = speechsdk.SpeechConfig(
            subscription=self.runtime.credentials["azure_speech_api_key"],
            region=self.runtime.credentials["azure_speech_region"],
        )
        voice_name = tool_parameters.get("speech_synthesis_voice_name", "en-US-AvaMultilingualNeural")
        config.speech_synthesis_voice_name = voice_name
        mp3_format = speechsdk.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3
        config.set_speech_synthesis_output_format(mp3_format)

        # audio_config=None, as in the original call; the audio is read back
        # from the result below.
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=None)

        text: str = tool_parameters.get("text", "")
        result = synthesizer.speak_text(text=text)
        outcome = result.reason

        if outcome != speechsdk.ResultReason.SynthesizingAudioCompleted:
            if outcome != speechsdk.ResultReason.Canceled:
                return [self.create_text_message("Audio generation failed")]

            details = result.cancellation_details
            msg = f"Speech synthesis canceled: {details.reason}\n"
            is_error = details.reason == speechsdk.CancellationReason.Error
            if is_error and details.error_details:
                msg += f"Error details: {details.error_details}"
            return [self.create_text_message(msg)]

        # Drain the result stream in 1 KiB chunks; read_data returns how many
        # bytes of the chunk buffer hold data, and 0 ends the loop.
        stream = speechsdk.AudioDataStream(result)
        collected = io.BytesIO()
        chunk = bytes(1024)
        while (count := stream.read_data(chunk)) > 0:
            collected.write(chunk[:count])

        confirmation = self.create_text_message("Audio generated successfully")
        audio_blob = self.create_blob_message(
            blob=collected.getvalue(),
            meta={"mime_type": "audio/mpeg"},
            save_as=self.VariableKey.AUDIO,
        )
        return [confirmation, audio_blob]
33 changes: 33 additions & 0 deletions api/core/tools/provider/builtin/azurespeech/tools/tts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
identity:
name: azure_tts
author: ymshenyu
label:
en_US: Azure Text To Speech
description:
human:
en_US: Convert text to audio file.
zh_Hans: 将文本转换为音频文件。
llm: Convert text to audio file.
parameters:
- name: text
type: string
required: true
label:
en_US: Text
zh_Hans: 文本
human_description:
en_US: The text to be converted.
zh_Hans: 要转换的文本。
llm_description: The text to be converted.
form: llm
- name: speech_synthesis_voice_name
type: string
required: true
label:
en_US: Voice Name
zh_Hans: 音色
human_description:
en_US: The voice name to be used.
zh_Hans: 要使用的音色。
llm_description: The voice name to be used.
form: form
Loading

0 comments on commit 28f7f6f

Please sign in to comment.