feat(podcast_generator): add new podcast generation tools

- Introduced podcast generator with text-to-speech functionality using OpenAI's API.
- Implemented credential validation for TTS services and API keys.
- Added support for generating podcast audio with alternating host voices.
- Included user-friendly setup with internationalized YAML configuration.
- Added SVG icon to enhance visual identification.
langgenius · Oct 14, 2024 · e442d8e · e442d8e
1 parent 2846277
commit e442d8e
Show file tree

Hide file tree

Showing 5 changed files with 287 additions and 0 deletions.
diff --git a/api/core/tools/provider/builtin/podcast_generator/_assets/icon.svg b/api/core/tools/provider/builtin/podcast_generator/_assets/icon.svg
diff --git a/api/core/tools/provider/builtin/podcast_generator/podcast_generator.py b/api/core/tools/provider/builtin/podcast_generator/podcast_generator.py
@@ -0,0 +1,33 @@
+from typing import Any
+
+import openai
+
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class PodcastGeneratorProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
+        tts_service = credentials.get("tts_service")
+        api_key = credentials.get("api_key")
+
+        if not tts_service:
+            raise ToolProviderCredentialValidationError("TTS service is not specified")
+
+        if not api_key:
+            raise ToolProviderCredentialValidationError("API key is missing")
+
+        if tts_service == "openai":
+            self._validate_openai_credentials(api_key)
+        else:
+            raise ToolProviderCredentialValidationError(f"Unsupported TTS service: {tts_service}")
+
+    def _validate_openai_credentials(self, api_key: str) -> None:
+        client = openai.OpenAI(api_key=api_key)
+        try:
+            # We're using a simple API call to validate the credentials
+            client.models.list()
+        except openai.AuthenticationError:
+            raise ToolProviderCredentialValidationError("Invalid OpenAI API key")
+        except Exception as e:
+            raise ToolProviderCredentialValidationError(f"Error validating OpenAI API key: {str(e)}")
diff --git a/api/core/tools/provider/builtin/podcast_generator/podcast_generator.yaml b/api/core/tools/provider/builtin/podcast_generator/podcast_generator.yaml
@@ -0,0 +1,34 @@
+identity:
+  author: Dify
+  name: podcast_generator
+  label:
+    en_US: Podcast Generator
+    zh_Hans: 播客生成器
+  description:
+    en_US: Generate podcast audio using Text-to-Speech services
+    zh_Hans: 使用文字转语音服务生成播客音频
+  icon: icon.svg
+credentials_for_provider:
+  tts_service:
+    type: select
+    required: true
+    label:
+      en_US: TTS Service
+      zh_Hans: TTS 服务
+    placeholder:
+      en_US: Select a TTS service
+      zh_Hans: 选择一个 TTS 服务
+    options:
+      - label:
+          en_US: OpenAI TTS
+          zh_Hans: OpenAI TTS
+        value: openai
+  api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: API Key
+      zh_Hans: API 密钥
+    placeholder:
+      en_US: Enter your TTS service API key
+      zh_Hans: 输入您的 TTS 服务 API 密钥
diff --git a/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.py b/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.py
@@ -0,0 +1,101 @@
+import concurrent.futures
+import random
+import struct
+from typing import Any, Literal, Optional, Union
+
+import openai
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.errors import ToolParameterValidationError, ToolProviderCredentialValidationError
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class PodcastAudioGeneratorTool(BuiltinTool):
+    @staticmethod
+    def _generate_silence(duration):
+        # Generate silent MP3 data
+        # This is a simplified version and may not work perfectly with all MP3 players
+        # For production use, consider using a proper audio library or pre-generated silence MP3
+        sample_rate = 44100
+        num_samples = int(duration * sample_rate)
+        silence_data = struct.pack("<" + "h" * num_samples, *([0] * num_samples))
+
+        # Add a simple MP3 header (this is not a complete MP3 file, but might work for basic needs)
+        mp3_header = b"\xff\xfb\x90\x04"  # A very basic MP3 header
+        return mp3_header + silence_data
+
+    @staticmethod
+    def _generate_audio_segment(
+        client: openai.OpenAI,
+        line: str,
+        voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
+        index: int,
+    ) -> tuple[int, Union[bytes, str], Optional[bytes]]:
+        try:
+            response = client.audio.speech.create(model="tts-1", voice=voice, input=line.strip())
+            audio = response.content
+            silence_duration = random.uniform(2, 5)
+            silence = PodcastAudioGeneratorTool._generate_silence(silence_duration)
+            return index, audio, silence
+        except Exception as e:
+            return index, f"Error generating audio: {str(e)}", None
+
+    def _invoke(
+        self, user_id: str, tool_parameters: dict[str, Any]
+    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        # Extract parameters
+        script = tool_parameters.get("script", "")
+        host1_voice = tool_parameters.get("host1_voice")
+        host2_voice = tool_parameters.get("host2_voice")
+
+        # Split the script into lines
+        script_lines = [line for line in script.split("\n") if line.strip()]
+
+        # Ensure voices are provided
+        if not host1_voice or not host2_voice:
+            raise ToolParameterValidationError("Host voices are required")
+
+        # Get OpenAI API key from credentials
+        if not self.runtime or not self.runtime.credentials:
+            raise ToolProviderCredentialValidationError("Tool runtime or credentials are missing")
+        api_key = self.runtime.credentials.get("api_key")
+        if not api_key:
+            raise ToolProviderCredentialValidationError("OpenAI API key is missing")
+
+        # Initialize OpenAI client
+        client = openai.OpenAI(api_key=api_key)
+
+        # Create a thread pool
+        max_workers = 5
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for i, line in enumerate(script_lines):
+                voice = host1_voice if i % 2 == 0 else host2_voice
+                future = executor.submit(self._generate_audio_segment, client, line, voice, i)
+                futures.append(future)
+
+            # Collect results
+            audio_segments: list[Any] = [None] * len(script_lines)
+            for future in concurrent.futures.as_completed(futures):
+                index, audio, silence = future.result()
+                if isinstance(audio, str):  # Error occurred
+                    return self.create_text_message(audio)
+                audio_segments[index] = (audio, silence)
+
+        # Combine audio segments in the correct order
+        combined_audio = b""
+        for i, (audio, silence) in enumerate(audio_segments):
+            if audio:
+                combined_audio += audio
+                if i < len(audio_segments) - 1 and silence:
+                    combined_audio += silence
+
+        # Create a blob message with the combined audio
+        return [
+            self.create_text_message("Audio generated successfully"),
+            self.create_blob_message(
+                blob=combined_audio,
+                meta={"mime_type": "audio/mpeg"},
+                save_as=self.VariableKey.AUDIO,
+            ),
+        ]
diff --git a/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.yaml b/api/core/tools/provider/builtin/podcast_generator/tools/podcast_audio_generator.yaml
@@ -0,0 +1,95 @@
+identity:
+  name: podcast_audio_generator
+  author: Dify
+  label:
+    en_US: Podcast Audio Generator
+    zh_Hans: 播客音频生成器
+description:
+  human:
+    en_US: Generate a podcast audio file from a script with two alternating voices using OpenAI's TTS service.
+    zh_Hans: 使用 OpenAI 的 TTS 服务，从包含两个交替声音的脚本生成播客音频文件。
+  llm: This tool converts a prepared podcast script into an audio file using OpenAI's Text-to-Speech service, with two specified voices for alternating hosts.
+parameters:
+  - name: script
+    type: string
+    required: true
+    label:
+      en_US: Podcast Script
+      zh_Hans: 播客脚本
+    human_description:
+      en_US: A string containing alternating lines for two hosts, separated by newline characters.
+      zh_Hans: 包含两位主持人交替台词的字符串，每行用换行符分隔。
+    llm_description: A string representing the script, with alternating lines for two hosts separated by newline characters.
+    form: llm
+  - name: host1_voice
+    type: select
+    required: true
+    label:
+      en_US: Host 1 Voice
+      zh_Hans: 主持人1 音色
+    human_description:
+      en_US: The voice for the first host.
+      zh_Hans: 第一位主持人的音色。
+    llm_description: The voice identifier for the first host's voice.
+    options:
+      - label:
+          en_US: Alloy
+          zh_Hans: Alloy
+        value: alloy
+      - label:
+          en_US: Echo
+          zh_Hans: Echo
+        value: echo
+      - label:
+          en_US: Fable
+          zh_Hans: Fable
+        value: fable
+      - label:
+          en_US: Onyx
+          zh_Hans: Onyx
+        value: onyx
+      - label:
+          en_US: Nova
+          zh_Hans: Nova
+        value: nova
+      - label:
+          en_US: Shimmer
+          zh_Hans: Shimmer
+        value: shimmer
+    form: form
+  - name: host2_voice
+    type: select
+    required: true
+    label:
+      en_US: Host 2 Voice
+      zh_Hans: 主持人2 音色
+    human_description:
+      en_US: The voice for the second host.
+      zh_Hans: 第二位主持人的音色。
+    llm_description: The voice identifier for the second host's voice.
+    options:
+      - label:
+          en_US: Alloy
+          zh_Hans: Alloy
+        value: alloy
+      - label:
+          en_US: Echo
+          zh_Hans: Echo
+        value: echo
+      - label:
+          en_US: Fable
+          zh_Hans: Fable
+        value: fable
+      - label:
+          en_US: Onyx
+          zh_Hans: Onyx
+        value: onyx
+      - label:
+          en_US: Nova
+          zh_Hans: Nova
+        value: nova
+      - label:
+          en_US: Shimmer
+          zh_Hans: Shimmer
+        value: shimmer
+    form: form