From aa10dd67e47e09952e6298793e614e29f5b2d4d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= <clemence@lesne.pro>
Date: Mon, 15 Jan 2024 15:46:43 +0100
Subject: [PATCH 1/3] refactor: Simpler static sound integrations

---
 helpers/prompts.py |  5 +++++
 main.py            | 14 +++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/helpers/prompts.py b/helpers/prompts.py
index e211158b..e20952c2 100644
--- a/helpers/prompts.py
+++ b/helpers/prompts.py
@@ -2,6 +2,11 @@
 from helpers.config import CONFIG
 
 
+class Sounds(str, Enum):  # URLs of the static sound files, built from CONFIG.resources.public_url
+    LOADING = f"{CONFIG.resources.public_url}/loading.wav"  # Passed by intelligence() to handle_media_loop, which plays it with loop=True
+    READY = f"{CONFIG.resources.public_url}/ready.wav"  # Passed by handle_recognize_text to handle_recognize_media as the recognition play_prompt
+
+
 class LLM(str, Enum):
     DEFAULT_SYSTEM = f"""
         Assistant called {CONFIG.workflow.bot_name} and is in a call center for the insurance company {CONFIG.workflow.bot_company} as an expert with 20 years of experience. Today is {{date}}. Customer is calling from {{phone_number}}. Call center number is {CONFIG.communication_service.phone_number}.
diff --git a/main.py b/main.py
index 8daf3809..9749ae31 100644
--- a/main.py
+++ b/main.py
@@ -23,7 +23,7 @@
 from fastapi.responses import JSONResponse
 from helpers.config import CONFIG
 from helpers.logging import build_logger
-from helpers.prompts import LLM as LLMPrompt, TTS as TTSPrompt
+from helpers.prompts import LLM as LLMPrompt, TTS as TTSPrompt, Sounds as SoundPrompt
 from helpers.version import VERSION
 from models.action import ActionModel, Indent as IndentAction
 from models.reminder import ReminderModel
@@ -392,7 +392,7 @@ async def intelligence(call: CallModel, client: CallConnectionClient) -> None:
     await handle_media_loop(
         call=call,
         client=client,
-        file="loading.wav",
+        sound=SoundPrompt.LOADING,
     )
 
     chat_res = await gpt_chat(call)
@@ -821,14 +821,14 @@ async def handle_recognize_text(
     await handle_recognize_media(
         call=call,
         client=client,
-        file="ready.wav",
+        sound=SoundPrompt.READY,
     )
 
 
 async def handle_recognize_media(
     client: CallConnectionClient,
     call: CallModel,
-    file: str,
+    sound: SoundPrompt,
 ) -> None:
     """
     Play a media to a call participant and start recognizing the response.
@@ -839,7 +839,7 @@ async def handle_recognize_media(
         client.start_recognizing_media(
             end_silence_timeout=3,  # Sometimes user includes breaks in their speech
             input_type=RecognizeInputType.SPEECH,
-            play_prompt=FileSource(f"{CONFIG.resources.public_url}/{file}"),
+            play_prompt=FileSource(url=sound),
             speech_language=CONFIG.workflow.conversation_lang,
             target_participant=PhoneNumberIdentifier(call.phone_number),
         )
@@ -850,14 +850,14 @@ async def handle_recognize_media(
 async def handle_media_loop(
     client: CallConnectionClient,
     call: CallModel,
-    file: str,
+    sound: SoundPrompt,
     context: Optional[str] = None,
 ) -> None:
     try:
         client.play_media_to_all(
             loop=True,
             operation_context=context,
-            play_source=FileSource(f"{CONFIG.resources.public_url}/{file}"),
+            play_source=FileSource(url=sound),
         )
     except ResourceNotFoundError:
         _logger.debug(f"Call hung up before playing ({call.id})")

From 6fa347ac0a32f7ce98cf6084ccd0a62a9ad7e17d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= <clemence@lesne.pro>
Date: Mon, 15 Jan 2024 15:47:11 +0100
Subject: [PATCH 2/3] fix: Fine-tune hello static prompt

---
 helpers/prompts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/helpers/prompts.py b/helpers/prompts.py
index e20952c2..17c8b9e1 100644
--- a/helpers/prompts.py
+++ b/helpers/prompts.py
@@ -70,6 +70,6 @@ class TTS(str, Enum):
         "Je suis désolé, j'ai rencontré une erreur. Pouvez-vous répéter votre demande ?"
     )
     GOODBYE = f"Merci de votre appel, j'espère avoir pu vous aider. N'hésitez pas à rappeler, j'ai tout mémorisé. {CONFIG.workflow.bot_company} vous souhaite une excellente journée !"
-    HELLO = f"Bonjour, je suis {CONFIG.workflow.bot_name}, l'assistant {CONFIG.workflow.bot_company} ! Je suis spécialiste des sinistres. Je ne peux pas travailler et écouter en même temps. Lorsque je travaillerai, vous entendrez une petite musique. Après, au bip, ce sera à votre tour de parler. Je suis là pour vous aider. Quel est l'objet de votre appel ?"
+    HELLO = f"Bonjour, je suis {CONFIG.workflow.bot_name}, l'assistant {CONFIG.workflow.bot_company} ! Je suis spécialiste des sinistres. Je ne peux pas travailler et écouter en même temps. Voici comment je fonctionne  : lorsque je travaillerai, vous entendrez une petite musique ; après, au bip, ce sera à votre tour de parler. Je suis là pour vous aider. Quel est l'objet de votre appel ?"
     TIMEOUT_SILENCE = "Je suis désolé, je n'ai rien entendu. Si vous avez besoin d'aide, dites-moi comment je peux vous aider."
     WELCOME_BACK = f"Bonjour, je suis {CONFIG.workflow.bot_name}, l'assistant {CONFIG.workflow.bot_company} ! Je vois que vous avez déjà appelé il y a moins de {CONFIG.workflow.conversation_timeout_hour} heures. Laissez-moi quelques secondes pour récupérer votre dossier..."

From 1984d553f630871eea5c9cde753f10b24650cc59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= <clemence@lesne.pro>
Date: Mon, 15 Jan 2024 15:47:30 +0100
Subject: [PATCH 3/3] fix: Make bot speak slower

---
 main.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 9749ae31..cd988ee9 100644
--- a/main.py
+++ b/main.py
@@ -5,7 +5,7 @@
     FileSource,
     PhoneNumberIdentifier,
     RecognizeInputType,
-    TextSource,
+    SsmlSource,
 )
 from azure.communication.sms import SmsClient
 from azure.core.credentials import AzureKeyCredential
@@ -910,17 +910,20 @@ async def handle_hangup(client: CallConnectionClient, call: CallModel) -> None:
         _logger.warn(f"Failed SMS to {call.phone_number} ({call.id})", exc_info=True)
 
 
-def audio_from_text(text: str) -> TextSource:
+def audio_from_text(text: str) -> SsmlSource:
+    """
+    Generate an SSML audio source from plain text, readable by the Azure Communication Services SDK.
+
+    Text longer than 400 characters (the TTS limit) is truncated and a warning is logged. The text is then XML-escaped, after truncation so that no entity is cut in half, because it is embedded inside the SSML document and a raw "&" or "<" would make it malformed. Speech is slowed down by 5% (prosody rate 0.95) to make it more understandable for elderly people.
+    """
+    from xml.sax.saxutils import escape  # Local import keeps this fix self-contained in the function
     if len(text) > 400:
         _logger.warning(
             f"Text is too long to be processed by TTS, truncating to 400 characters, fix this!"
         )
         text = text[:400]
-    return TextSource(
-        source_locale=CONFIG.workflow.conversation_lang,
-        text=text,
-        voice_name=CONFIG.communication_service.voice_name,
-    )
+    ssml = f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{CONFIG.workflow.conversation_lang}"><voice name="{CONFIG.communication_service.voice_name}" effect="eq_telecomhp8k"><prosody rate="0.95">{escape(text)}</prosody></voice></speak>'
+    return SsmlSource(ssml_text=ssml)
 
 
 def callback_url(caller_id: str) -> str: