Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

17 implement cli based interaction #34

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions .github/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ git clone https://github.com/descriptinc/audiotools
python -m pip install audiotools
python -m pip install -e .
rm -rf audiotools
python -m pip install --upgrade streamlit
11 changes: 9 additions & 2 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,12 @@ jobs:
- name: Install
run: pip install -e '.[tests]'

- name: Run tests
run: pytest -v tests
- name: Run Unit Tests
run: pytest -v tests/unit

- name: Run Integration Tests
run: pytest -v tests/integration

- name: Run E2E tests
if: ${{ github.event_name == 'workflow_dispatch' }}
run: pytest -v tests/e2e
53 changes: 22 additions & 31 deletions demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,19 @@
from pathlib import Path

import numpy as np
import soundfile as sf
import streamlit as st

from document_to_podcast.podcast_maker.script_to_audio import save_waveform_as_file
from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_parler_tts_model_and_tokenizer,
)
from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.inference.text_to_text import text_to_text_stream


PODCAST_PROMPT = """
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers:
Speaker 1: Laura, the main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
Speaker 2: Jon, the co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like “hmm” or “umm.”
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, that's cool! But how does..."
}
"""

SPEAKER_DESCRIPTIONS = {
"1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
"2": "Jon's voice is calm with very clear audio and no background noise.",
}


@st.cache_resource
def load_text_to_text_model():
return load_llama_cpp_model(
Expand Down Expand Up @@ -138,9 +115,18 @@ def gen_button_clicked():
)
st.divider()

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)
st.subheader("Speaker configuration")
for s in DEFAULT_SPEAKERS:
s.pop("id", None)
speakers = st.data_editor(DEFAULT_SPEAKERS, num_rows="dynamic")

if st.button("Generate Podcast", on_click=gen_button_clicked):
for n, speaker in enumerate(speakers):
speaker["id"] = n + 1
system_prompt = DEFAULT_PROMPT.replace(
"{SPEAKERS}",
"\n".join(str(Speaker.model_validate(speaker)) for speaker in speakers),
)
with st.spinner("Generating Podcast..."):
text = ""
for chunk in text_to_text_stream(
Expand All @@ -152,12 +138,17 @@ def gen_button_clicked():
st.write(st.session_state.script)

speaker_id = re.search(r"Speaker (\d+)", text).group(1)
tone = next(
speaker["tone"]
for speaker in speakers
if speaker["id"] == int(speaker_id)
)
with st.spinner("Generating Audio..."):
speech = text_to_speech(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
speech_tokenizer,
SPEAKER_DESCRIPTIONS[speaker_id],
tone,
)
st.audio(speech, sample_rate=speech_model.config.sampling_rate)
st.session_state.audio.append(speech)
Expand All @@ -166,10 +157,10 @@ def gen_button_clicked():
if st.session_state[gen_button]:
if st.button("Save Podcast to audio file"):
st.session_state.audio = np.concatenate(st.session_state.audio)
save_waveform_as_file(
waveform=st.session_state.audio,
sampling_rate=speech_model.config.sampling_rate,
filename="podcast.wav",
sf.write(
"podcast.wav",
st.session_state.audio,
samplerate=44100,
)
st.markdown("Podcast saved to disk!")

Expand Down
4 changes: 0 additions & 4 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,3 @@
::: document_to_podcast.inference.text_to_text

::: document_to_podcast.inference.text_to_speech

::: document_to_podcast.podcast_maker.script_to_audio

::: document_to_podcast.podcast_maker.config
30 changes: 30 additions & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Command Line Interface

Once you have [installed the blueprint](./getting-started.md), you can use it from the CLI.

You can either provide the path to a configuration file:

```bash
document-to-podcast --from_config "example_data/config.yaml"
```

Or provide values to the arguments directly:


```bash
document-to-podcast \
--input_file "example_data/Mozilla-Trustworthy_AI.pdf" \
--output_folder "example_data" \
--text_to_text_model "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
Comment on lines +15 to +18
Copy link
Contributor

@Kostis-S-Z Kostis-S-Z Dec 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't provide an argument, does it take the value from the config.yaml? <- I guess not.

Maybe the example should have all the arguments necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, you need to pass --from-config to make it parse a file. If you don't provide that argument, it works as a regular entrypoint.
Also, if you pass --from-config the rest of the args are just ignored (I could improve the arg parsing to make this more explicit and fail in case you pass something that will be ignored)
I considered this a good approach to support both interfaces but keeping them separated.

```

---

::: document_to_podcast.cli.document_to_podcast

---

::: document_to_podcast.config.Config
::: document_to_podcast.config.Speaker
::: document_to_podcast.config.DEFAULT_PROMPT
::: document_to_podcast.config.DEFAULT_SPEAKERS
30 changes: 30 additions & 0 deletions example_data/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
input_file: "example_data/introducing-mozilla-ai-investing-in-trustworthy-ai.html"
output_folder: "example_data/"
text_to_text_model: "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
text_to_speech_model: "parler-tts/parler-tts-mini-v1"
text_to_text_prompt: |
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
The script features the following speakers:
{SPEAKERS}
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, that's cool! But how does..."
}
sampling_rate: 44100
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this actually depends on the TTS model we use. Parler uses 44.100, but Oute models use 24.000. So sampling rate should be set internally by us like this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it! I don't know why I thought it was a user-facing param

speakers:
- id: 1
name: Laura
description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
tone: Laura's voice is exciting and fast in delivery with very clear audio and no background noise.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to this, should we change the tone to profile or something that fits both the Oute and Parler case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think tone: female_1 looks too bad?

- id: 2
name: Jon
description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
tone: Jon's voice is calm with very clear audio and no background noise.
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ nav:
- Getting Started: getting-started.md
- Step-by-Step Guide: step-by-step-guide.md
- Customization Guide: customization.md
- Command Line Interface: cli.md
- API Reference: api.md
- Future Features & Contributions: future-features-contributions.md

Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ docs = [
tests = [
"pytest>=8,<9",
"pytest-sugar>=0.9.6",
"pytest-mock>=3.14.0"
]

[project.urls]
Expand All @@ -43,3 +44,6 @@ where = ["src"]
namespaces = false

[tool.setuptools_scm]

[project.scripts]
document-to-podcast = "document_to_podcast.cli:main"
158 changes: 158 additions & 0 deletions src/document_to_podcast/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import re
from pathlib import Path

import numpy as np
import soundfile as sf
import yaml
from fire import Fire
from loguru import logger


from document_to_podcast.config import Config, Speaker, DEFAULT_PROMPT, DEFAULT_SPEAKERS
from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_parler_tts_model_and_tokenizer,
)
from document_to_podcast.inference.text_to_text import text_to_text_stream
from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS


@logger.catch(reraise=True)
def document_to_podcast(
    input_file: str | None = None,
    output_folder: str | None = None,
    text_to_text_model: str = "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
    text_to_text_prompt: str = DEFAULT_PROMPT,
    text_to_speech_model: str = "parler-tts/parler-tts-mini-v1",
    speakers: list[Speaker] | None = None,
    sampling_rate: int = 44100,
    from_config: str | None = None,
):
    """
    Generate a podcast from a document.

    Loads and cleans the input document, streams a podcast script from the
    text-to-text model, converts each speaker turn to audio, and writes
    `podcast.txt` and `podcast.wav` to the output folder.

    Args:
        input_file (str): The path to the input file.
            Supported extensions:
                - .pdf
                - .html
                - .txt
                - .docx
                - .md
        output_folder (str): The path to the output folder.
            Two files will be created:
                - {output_folder}/podcast.txt
                - {output_folder}/podcast.wav
        text_to_text_model (str, optional): The path to the text-to-text model.
            Need to be formatted as `owner/repo/file`.
            Need to be a gguf file.
            Defaults to `allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf`.
        text_to_text_prompt (str, optional): The prompt for the text-to-text model.
            Defaults to DEFAULT_PROMPT.
        text_to_speech_model (str, optional): The path to the text-to-speech model.
            Defaults to `parler-tts/parler-tts-mini-v1`.
        speakers (list[Speaker] | None, optional): The speakers for the podcast.
            Defaults to DEFAULT_SPEAKERS.
        sampling_rate (int, optional): The sampling rate for the output audio.
            Defaults to 44_100.
        from_config (str, optional): The path to the config file. Defaults to None.
            If provided, all other arguments will be ignored.
    """
    if from_config:
        # The config file is the single source of truth; every other CLI
        # argument is ignored in this branch.
        config = Config.model_validate(yaml.safe_load(Path(from_config).read_text()))
    else:
        speakers = speakers or DEFAULT_SPEAKERS
        config = Config(
            input_file=input_file,
            output_folder=output_folder,
            text_to_text_model=text_to_text_model,
            text_to_text_prompt=text_to_text_prompt,
            text_to_speech_model=text_to_speech_model,
            speakers=[Speaker.model_validate(speaker) for speaker in speakers],
            sampling_rate=sampling_rate,
        )

    # Use a distinct local name instead of shadowing the `output_folder`
    # parameter with a Path.
    output_dir = Path(config.output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Loader/cleaner are selected by file extension (e.g. ".pdf", ".html").
    data_loader = DATA_LOADERS[Path(config.input_file).suffix]
    logger.info(f"Loading {config.input_file}")
    raw_text = data_loader(config.input_file)
    logger.debug(f"Loaded {len(raw_text)} characters")

    data_cleaner = DATA_CLEANERS[Path(config.input_file).suffix]
    logger.info(f"Cleaning {config.input_file}")
    clean_text = data_cleaner(raw_text)
    logger.debug(f"Cleaned {len(raw_text) - len(clean_text)} characters")
    logger.debug(f"Length of cleaned text: {len(clean_text)}")

    logger.info(f"Loading {config.text_to_text_model}")
    text_model = load_llama_cpp_model(model_id=config.text_to_text_model)
    logger.info(f"Loading {config.text_to_speech_model}")
    speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
        model_id=config.text_to_speech_model
    )

    # ~4 characters per token is considered a reasonable default.
    max_characters = text_model.n_ctx() * 4
    if len(clean_text) > max_characters:
        logger.warning(
            f"Input text is too big ({len(clean_text)})."
            f" Using only a subset of it ({max_characters})."
        )
        clean_text = clean_text[:max_characters]

    logger.info("Generating Podcast...")
    podcast_script = ""
    text = ""
    podcast_audio = []
    system_prompt = config.text_to_text_prompt.strip()
    # The prompt template carries a literal "{SPEAKERS}" placeholder that is
    # filled with one description line per configured speaker.
    system_prompt = system_prompt.replace(
        "{SPEAKERS}", "\n".join(str(speaker) for speaker in config.speakers)
    )
    for chunk in text_to_text_stream(
        clean_text, text_model, system_prompt=system_prompt
    ):
        text += chunk
        podcast_script += chunk
        # A newline ending a chunk marks a completed speaker turn; synthesize
        # audio for it with the matching speaker's voice description.
        if text.endswith("\n") and "Speaker" in text:
            logger.debug(text)
            speaker_id = re.search(r"Speaker (\d+)", text).group(1)
            tone = next(
                speaker for speaker in config.speakers if speaker.id == int(speaker_id)
            ).tone
            speech = text_to_speech(
                text.split(f'"Speaker {speaker_id}":')[-1],
                speech_model,
                speech_tokenizer,
                tone,
            )
            podcast_audio.append(speech)
            text = ""

    logger.info("Saving Podcast...")
    sf.write(
        str(output_dir / "podcast.wav"),
        np.concatenate(podcast_audio),
        # BUG FIX: previously used the raw `sampling_rate` parameter, which
        # silently ignored the value loaded via --from_config. The validated
        # config is the source of truth for both branches.
        samplerate=config.sampling_rate,
    )
    (output_dir / "podcast.txt").write_text(podcast_script)
    logger.success("Done!")


def main():
    """Console-script entry point (wired via [project.scripts] in pyproject.toml).

    Fire exposes document_to_podcast's keyword arguments as CLI flags
    (e.g. --input_file, --from_config).
    """
    Fire(document_to_podcast)
Loading
Loading