Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

17 implement cli based interaction #34

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions .github/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ git clone https://github.com/descriptinc/audiotools
python -m pip install audiotools
python -m pip install -e .
rm -rf audiotools
python -m pip install --upgrade streamlit
11 changes: 9 additions & 2 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,12 @@ jobs:
- name: Install
run: pip install -e '.[tests]'

- name: Run tests
run: pytest -v tests
- name: Run Unit Tests
run: pytest -v tests/unit

- name: Run Integration Tests
run: pytest -v tests/integration

- name: Run E2E tests
if: ${{ github.event_name == 'workflow_dispatch' }}
run: pytest -v tests/e2e
53 changes: 22 additions & 31 deletions demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,19 @@
from pathlib import Path

import numpy as np
import soundfile as sf
import streamlit as st

from document_to_podcast.podcast_maker.script_to_audio import save_waveform_as_file
from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_parler_tts_model_and_tokenizer,
)
from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.inference.text_to_text import text_to_text_stream


PODCAST_PROMPT = """
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers:
Speaker 1: Laura, the main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
Speaker 2: Jon, the co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like “hmm” or “umm.”
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, that's cool! But how does..."
}
"""

SPEAKER_DESCRIPTIONS = {
"1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
"2": "Jon's voice is calm with very clear audio and no background noise.",
}


@st.cache_resource
def load_text_to_text_model():
return load_llama_cpp_model(
Expand Down Expand Up @@ -138,9 +115,18 @@ def gen_button_clicked():
)
st.divider()

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)
st.subheader("Speaker configuration")
for s in DEFAULT_SPEAKERS:
s.pop("id", None)
speakers = st.data_editor(DEFAULT_SPEAKERS, num_rows="dynamic")

if st.button("Generate Podcast", on_click=gen_button_clicked):
for n, speaker in enumerate(speakers):
speaker["id"] = n + 1
system_prompt = DEFAULT_PROMPT.replace(
"{SPEAKERS}",
"\n".join(str(Speaker.model_validate(speaker)) for speaker in speakers),
)
with st.spinner("Generating Podcast..."):
text = ""
for chunk in text_to_text_stream(
Expand All @@ -152,12 +138,17 @@ def gen_button_clicked():
st.write(st.session_state.script)

speaker_id = re.search(r"Speaker (\d+)", text).group(1)
tone = next(
speaker["tone"]
for speaker in speakers
if speaker["id"] == int(speaker_id)
)
with st.spinner("Generating Audio..."):
speech = text_to_speech(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
speech_tokenizer,
SPEAKER_DESCRIPTIONS[speaker_id],
tone,
)
st.audio(speech, sample_rate=speech_model.config.sampling_rate)
st.session_state.audio.append(speech)
Expand All @@ -166,10 +157,10 @@ def gen_button_clicked():
if st.session_state[gen_button]:
if st.button("Save Podcast to audio file"):
st.session_state.audio = np.concatenate(st.session_state.audio)
save_waveform_as_file(
waveform=st.session_state.audio,
sampling_rate=speech_model.config.sampling_rate,
filename="podcast.wav",
sf.write(
"podcast.wav",
st.session_state.audio,
samplerate=44100,
)
st.markdown("Podcast saved to disk!")

Expand Down
4 changes: 0 additions & 4 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,3 @@
::: document_to_podcast.inference.text_to_text

::: document_to_podcast.inference.text_to_speech

::: document_to_podcast.podcast_maker.script_to_audio

::: document_to_podcast.podcast_maker.config
30 changes: 30 additions & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Command Line Interface

Once you have [installed the blueprint](./getting-started.md), you can use it from the CLI.

You can either provide the path to a configuration file:

```bash
document-to-podcast --from_config "example_data/config.yaml"
```

Or provide values to the arguments directly:


```bash
document-to-podcast \
--input_file "example_data/Mozilla-Trustworthy_AI.pdf" \
--output_folder "example_data" \
--text_to_text_model "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
Comment on lines +15 to +18
Copy link
Contributor

@Kostis-S-Z Kostis-S-Z Dec 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't provide an argument, does it take the value from the config.yaml? <- I guess not.

Maybe the example should have all the arguments necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, you need to pass --from-config to make it parse a file. If you don't provide that argument, it works as a regular entrypoint.
Also, if you pass --from-config the rest of the args are just ignored (I could improve the arg parsing to make this more explicit and fail in case you pass something that will be ignored)
I considered this a good approach to support both interfaces but keeping them separated.

```

---

::: document_to_podcast.cli.document_to_podcast

---

::: document_to_podcast.config.Config
::: document_to_podcast.config.Speaker
::: document_to_podcast.config.DEFAULT_PROMPT
::: document_to_podcast.config.DEFAULT_SPEAKERS
30 changes: 30 additions & 0 deletions example_data/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
input_file: "example_data/introducing-mozilla-ai-investing-in-trustworthy-ai.html"
output_folder: "example_data/"
text_to_text_model: "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
text_to_speech_model: "parler-tts/parler-tts-mini-v1"
text_to_text_prompt: |
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
The script features the following speakers:
{SPEAKERS}
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, that's cool! But how does..."
}
sampling_rate: 44100
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this actually depends on the TTS model we use. Parler uses 44.100, but Oute models use 24.000. So sampling rate should be set internally by us like this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it! I don't know why I thought it was a user-facing param

speakers:
- id: 1
name: Laura
description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
tone: Laura's voice is exciting and fast in delivery with very clear audio and no background noise.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to this, should we change the tone to profile or something that fits both the Oute and Parler case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think tone: female_1 looks too bad?

- id: 2
name: Jon
description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
tone: Jon's voice is calm with very clear audio and no background noise.
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ nav:
- Getting Started: getting-started.md
- Step-by-Step Guide: step-by-step-guide.md
- Customization Guide: customization.md
- Command Line Interface: cli.md
- API Reference: api.md
- Future Features & Contributions: future-features-contributions.md

Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ docs = [
tests = [
"pytest>=8,<9",
"pytest-sugar>=0.9.6",
"pytest-mock>=3.14.0"
]

[project.urls]
Expand All @@ -43,3 +44,6 @@ where = ["src"]
namespaces = false

[tool.setuptools_scm]

[project.scripts]
document-to-podcast = "document_to_podcast.cli:main"
158 changes: 158 additions & 0 deletions src/document_to_podcast/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import re
from pathlib import Path

import numpy as np
import soundfile as sf
import yaml
from fire import Fire
from loguru import logger


from document_to_podcast.config import Config, Speaker, DEFAULT_PROMPT, DEFAULT_SPEAKERS
from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_parler_tts_model_and_tokenizer,
)
from document_to_podcast.inference.text_to_text import text_to_text_stream
from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS


@logger.catch(reraise=True)
def document_to_podcast(
    input_file: str | None = None,
    output_folder: str | None = None,
    text_to_text_model: str = "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
    text_to_text_prompt: str = DEFAULT_PROMPT,
    text_to_speech_model: str = "parler-tts/parler-tts-mini-v1",
    speakers: list[Speaker] | None = None,
    sampling_rate: int = 44100,
    from_config: str | None = None,
):
    """
    Generate a podcast from a document.

    Loads and cleans the input document, streams a podcast script from the
    text-to-text model, converts each speaker turn to audio, and writes
    `podcast.txt` and `podcast.wav` to the output folder.

    Args:
        input_file (str): The path to the input file.
            Supported extensions:
                - .pdf
                - .html
                - .txt
                - .docx
                - .md
        output_folder (str): The path to the output folder.
            Two files will be created:
                - {output_folder}/podcast.txt
                - {output_folder}/podcast.wav
        text_to_text_model (str, optional): The path to the text-to-text model.
            Need to be formatted as `owner/repo/file`.
            Need to be a gguf file.
            Defaults to `allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf`.
        text_to_text_prompt (str, optional): The prompt for the text-to-text model.
            Defaults to DEFAULT_PROMPT.
        text_to_speech_model (str, optional): The path to the text-to-speech model.
            Defaults to `parler-tts/parler-tts-mini-v1`.
        speakers (list[Speaker] | None, optional): The speakers for the podcast.
            Defaults to DEFAULT_SPEAKERS.
        sampling_rate (int, optional): The sampling rate for the output audio.
            Defaults to 44_100.
        from_config (str, optional): The path to the config file. Defaults to None.
            If provided, all other arguments will be ignored.
    """
    if from_config:
        # The config file is the single source of truth; every other CLI
        # argument is ignored in this branch.
        config = Config.model_validate(yaml.safe_load(Path(from_config).read_text()))
    else:
        speakers = speakers or DEFAULT_SPEAKERS
        config = Config(
            input_file=input_file,
            output_folder=output_folder,
            text_to_text_model=text_to_text_model,
            text_to_text_prompt=text_to_text_prompt,
            text_to_speech_model=text_to_speech_model,
            speakers=[Speaker.model_validate(speaker) for speaker in speakers],
            sampling_rate=sampling_rate,
        )

    # Use a distinct local name instead of shadowing the `output_folder`
    # parameter with a Path.
    output_dir = Path(config.output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Loader/cleaner are selected by file extension (e.g. ".pdf", ".html").
    data_loader = DATA_LOADERS[Path(config.input_file).suffix]
    logger.info(f"Loading {config.input_file}")
    raw_text = data_loader(config.input_file)
    logger.debug(f"Loaded {len(raw_text)} characters")

    data_cleaner = DATA_CLEANERS[Path(config.input_file).suffix]
    logger.info(f"Cleaning {config.input_file}")
    clean_text = data_cleaner(raw_text)
    logger.debug(f"Cleaned {len(raw_text) - len(clean_text)} characters")
    logger.debug(f"Length of cleaned text: {len(clean_text)}")

    logger.info(f"Loading {config.text_to_text_model}")
    text_model = load_llama_cpp_model(model_id=config.text_to_text_model)
    logger.info(f"Loading {config.text_to_speech_model}")
    speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
        model_id=config.text_to_speech_model
    )

    # ~4 characters per token is considered a reasonable default.
    max_characters = text_model.n_ctx() * 4
    if len(clean_text) > max_characters:
        logger.warning(
            f"Input text is too big ({len(clean_text)})."
            f" Using only a subset of it ({max_characters})."
        )
        clean_text = clean_text[:max_characters]

    logger.info("Generating Podcast...")
    podcast_script = ""
    text = ""
    podcast_audio = []
    system_prompt = config.text_to_text_prompt.strip()
    # The prompt template carries a literal "{SPEAKERS}" placeholder that is
    # filled with one description line per configured speaker.
    system_prompt = system_prompt.replace(
        "{SPEAKERS}", "\n".join(str(speaker) for speaker in config.speakers)
    )
    for chunk in text_to_text_stream(
        clean_text, text_model, system_prompt=system_prompt
    ):
        text += chunk
        podcast_script += chunk
        # A newline ending a chunk marks a completed speaker turn; synthesize
        # audio for it with the matching speaker's voice description.
        if text.endswith("\n") and "Speaker" in text:
            logger.debug(text)
            speaker_id = re.search(r"Speaker (\d+)", text).group(1)
            tone = next(
                speaker for speaker in config.speakers if speaker.id == int(speaker_id)
            ).tone
            speech = text_to_speech(
                text.split(f'"Speaker {speaker_id}":')[-1],
                speech_model,
                speech_tokenizer,
                tone,
            )
            podcast_audio.append(speech)
            text = ""

    logger.info("Saving Podcast...")
    sf.write(
        str(output_dir / "podcast.wav"),
        np.concatenate(podcast_audio),
        # BUG FIX: previously used the raw `sampling_rate` parameter, which
        # silently ignored the value loaded via --from_config. The validated
        # config is the source of truth for both branches.
        samplerate=config.sampling_rate,
    )
    (output_dir / "podcast.txt").write_text(podcast_script)
    logger.success("Done!")


def main():
    """Console-script entry point (wired via [project.scripts] in pyproject.toml).

    Fire exposes document_to_podcast's keyword arguments as CLI flags
    (e.g. --input_file, --from_config).
    """
    Fire(document_to_podcast)
Loading
Loading