feat: add azure speech support

langgenius · Dec 5, 2024 · c3edde3 · c3edde3
1 parent 6180762
commit c3edde3
Show file tree

Hide file tree

Showing 10 changed files with 11,290 additions and 11,059 deletions.
diff --git a/api/core/tools/provider/builtin/azurespeech/__init__.py b/api/core/tools/provider/builtin/azurespeech/__init__.py
diff --git a/api/core/tools/provider/builtin/azurespeech/_assets/icon.png b/api/core/tools/provider/builtin/azurespeech/_assets/icon.png
diff --git a/api/core/tools/provider/builtin/azurespeech/azurespeech.py b/api/core/tools/provider/builtin/azurespeech/azurespeech.py
@@ -0,0 +1,8 @@
+from typing import Any
+
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class AzureSpeechProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
+        pass
diff --git a/api/core/tools/provider/builtin/azurespeech/azurespeech.yaml b/api/core/tools/provider/builtin/azurespeech/azurespeech.yaml
@@ -0,0 +1,51 @@
+identity:
+  author: ymshenyu
+  name: azurespeech
+  label:
+    en_US: Azure Speech
+    zh_Hans: Azure Speech
+  description:
+    en_US: Azure Speech services for speech-to-text and text-to-speech.
+    zh_Hans: Azure Speech 语音服务，支持语音转文本和文本转语音。
+  icon: icon.png
+  tags:
+    - utilities
+credentials_for_provider:
+  azure_speech_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: API key
+      zh_Hans: 密钥
+    help:
+      en_US: Please input your Azure Speech API key
+      zh_Hans: 请输入你的 Azure Speech API key
+    placeholder:
+      en_US: Please input your Azure Speech API key
+      zh_Hans: 请输入你的 Azure Speech API key
+  azure_speech_region:
+    type: text-input
+    required: true
+    label:
+      en_US: API Region
+      zh_Hans: API 地区
+    help:
+      en_US: Please input your Azure Speech Region, e.g. eastasia
+      zh_Hans: 请输入你的 Azure Speech 地区，例如：eastasia
+    placeholder:
+      en_US: Please input your Azure Speech Region, e.g. eastasia
+      zh_Hans: 请输入你的 Azure Speech 地区，例如：eastasia
+  azure_speech_api_version:
+    type: text-input
+    required: false
+    default: '2024-11-15'
+    label:
+      en_US: API Version
+      zh_Hans: API Version
+      pt_BR: Versão da API
+    placeholder:
+      en_US: Please input your Azure Speech API Version
+      zh_Hans: 请输入你的 Azure Speech API Version
+    help:
+      en_US: Get your API Version from Azure Speech
+      zh_Hans: 从 Azure Speech 获取您的 API Version
diff --git a/api/core/tools/provider/builtin/azurespeech/tools/asr.py b/api/core/tools/provider/builtin/azurespeech/tools/asr.py
@@ -0,0 +1,31 @@
+from typing import Any
+
+import requests
+
+from core.file.enums import FileType
+from core.file.file_manager import download
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class AzureASRTool(BuiltinTool):
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInvokeMessage]:
+        file = tool_parameters.get("audio_file")
+        if file.type != FileType.AUDIO:
+            return [self.create_text_message("not a valid audio file")]
+        audio_binary = download(file)
+
+        resp = requests.post(
+            "https://{}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version={}".format(
+                self.runtime.credentials.get("azure_speech_region"),
+                self.runtime.credentials.get("azure_speech_api_version"),
+            ),
+            headers={
+                "Ocp-Apim-Subscription-Key": self.runtime.credentials.get("azure_speech_api_key"),
+            },
+            files={"audio": audio_binary},
+        )
+
+        data: dict = resp.json()
+
+        return [self.create_text_message(data.get("combinedPhrases", [])[0].get("text", ""))]
diff --git a/api/core/tools/provider/builtin/azurespeech/tools/asr.yaml b/api/core/tools/provider/builtin/azurespeech/tools/asr.yaml
@@ -0,0 +1,22 @@
+identity:
+  name: azure_asr
+  author: ymshenyu
+  label:
+    en_US: Azure Speech To Text
+description:
+  human:
+    en_US: Convert audio file to text.
+    zh_Hans: 将音频文件转换为文本。
+  llm: Convert audio file to text.
+parameters:
+  - name: audio_file
+    type: file
+    required: true
+    label:
+      en_US: Audio File
+      zh_Hans: 音频文件
+    human_description:
+      en_US: The audio file to be converted.
+      zh_Hans: 要转换的音频文件。
+    llm_description: The audio file to be converted.
+    form: llm
diff --git a/api/core/tools/provider/builtin/azurespeech/tools/tts.py b/api/core/tools/provider/builtin/azurespeech/tools/tts.py
@@ -0,0 +1,55 @@
+import io
+from typing import Any
+
+import azure.cognitiveservices.speech as speechsdk
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class AzureTTSTool(BuiltinTool):
+    def _invoke(
+        self,
+        user_id: str,
+        tool_parameters: dict[str, Any],
+    ) -> list[ToolInvokeMessage]:
+        speech_config = speechsdk.SpeechConfig(
+            subscription=self.runtime.credentials["azure_speech_api_key"],
+            region=self.runtime.credentials["azure_speech_region"],
+        )
+        speech_config.speech_synthesis_voice_name = tool_parameters.get(
+            "speech_synthesis_voice_name", "en-US-AvaMultilingualNeural"
+        )
+        speech_config.set_speech_synthesis_output_format(
+            speechsdk.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3
+        )
+
+        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
+
+        text: str = tool_parameters.get("text", "")
+        speech_synthesis_result = speech_synthesizer.speak_text(text=text)
+
+        if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+            stream = speechsdk.AudioDataStream(speech_synthesis_result)
+            audio_data = io.BytesIO()
+            buffer = bytes(1024)
+            filled_size = stream.read_data(buffer)
+            while filled_size > 0:
+                audio_data.write(buffer[:filled_size])
+                filled_size = stream.read_data(buffer)
+            return [
+                self.create_text_message("Audio generated successfully"),
+                self.create_blob_message(
+                    blob=audio_data.getvalue(),
+                    meta={"mime_type": "audio/mpeg"},
+                    save_as=self.VariableKey.AUDIO,
+                ),
+            ]
+        elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
+            cancellation_details = speech_synthesis_result.cancellation_details
+            msg = "Speech synthesis canceled: {}\n".format(cancellation_details.reason)
+            if cancellation_details.reason == speechsdk.CancellationReason.Error:
+                if cancellation_details.error_details:
+                    msg += "Error details: {}".format(cancellation_details.error_details)
+            return [self.create_text_message(msg)]
+        return [self.create_text_message("Audio generation failed")]
diff --git a/api/core/tools/provider/builtin/azurespeech/tools/tts.yaml b/api/core/tools/provider/builtin/azurespeech/tools/tts.yaml
@@ -0,0 +1,33 @@
+identity:
+  name: azure_tts
+  author: ymshenyu
+  label:
+    en_US: Azure Text To Speech
+description:
+  human:
+    en_US: Convert text to audio file.
+    zh_Hans: 将文本转换为音频文件。
+  llm: Convert text to audio file.
+parameters:
+  - name: text
+    type: string
+    required: true
+    label:
+      en_US: Text
+      zh_Hans: 文本
+    human_description:
+      en_US: The text to be converted.
+      zh_Hans: 要转换的文本。
+    llm_description: The text to be converted.
+    form: llm
+  - name: speech_synthesis_voice_name
+    type: string
+    required: true
+    label:
+      en_US: Voice Name
+      zh_Hans: 音色
+    human_description:
+      en_US: The voice name to be used.
+      zh_Hans: 要使用的音色。
+    llm_description: The voice name to be used.
+    form: form