Merge branch 'develop'
clemlesne committed Nov 26, 2024
2 parents 5ca7ac2 + 420f2e2 commit a284bb2
Showing 5 changed files with 115 additions and 65 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -637,7 +637,8 @@ Conversation options are represented as features. They can be configured from App Configuration.
| `recognition_retry_max` | The maximum number of retries for voice recognition. | `int` | 2 |
| `recording_enabled` | Whether call recording is enabled. | `bool` | false |
| `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false |
| `vad_silence_timeout_ms` | The timeout for phone silence in milliseconds. | `int` | 500 |
| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in milliseconds. | `int` | 400 |
| `vad_silence_timeout_ms` | The timeout for phone silence in milliseconds. | `int` | 400 |
| `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 |
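
Feature values can be updated at runtime from the App Configuration store. As a hedged illustration, the sketch below sets `vad_silence_timeout_ms` with the `azure-appconfiguration` Python SDK; the connection string is a placeholder and this snippet is not part of the repository:

```python
# Illustrative only: update a conversation feature at runtime. The connection
# string below is a placeholder, not a value from this repository.
from azure.appconfiguration import (
    AzureAppConfigurationClient,
    ConfigurationSetting,
)

client = AzureAppConfigurationClient.from_connection_string("<connection-string>")
client.set_configuration_setting(
    ConfigurationSetting(
        key="vad_silence_timeout_ms",
        value="400",  # stored as a string, parsed as `int` by the app
    )
)
```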

### Use an OpenAI-compatible model for the LLM
79 changes: 56 additions & 23 deletions app/helpers/call_llm.py
@@ -13,6 +13,7 @@
from azure.communication.callautomation.aio import CallAutomationClient
from openai import APIError
from pydub import AudioSegment
from pydub.effects import high_pass_filter, low_pass_filter

from app.helpers.call_utils import (
handle_clear_queue,
@@ -24,6 +25,7 @@
from app.helpers.features import (
answer_hard_timeout_sec,
answer_soft_timeout_sec,
vad_cutoff_timeout_ms,
vad_silence_timeout_ms,
vad_threshold,
)
@@ -65,6 +67,7 @@ async def load_llm_chat( # noqa: PLR0913
# Init language recognition
speech_token = await (await token("https://cognitiveservices.azure.com/.default"))()
recognizer_buffer: list[str] = []
recognizer_store_next_recognition = False
recognizer_lock = asyncio.Event()
recognizer_stream = PushAudioInputStream(
stream_format=AudioStreamFormat(
@@ -86,25 +89,44 @@

def _handle_partial_recognition(event: SpeechRecognitionEventArgs) -> None:
text = event.result.text

# Skip if no text
if not text:
return

# Init buffer if empty
if not recognizer_buffer:
recognizer_buffer.append("")

# Replace the last element with this update
recognizer_buffer[-1] = text
logger.debug("Partial recognition: %s", recognizer_buffer)

# Lock the recognition until the audio stream is ready
recognizer_lock.set()

def _handle_complete_recognition(event: SpeechRecognitionEventArgs) -> None:
text = event.result.text

# Skip if no text
if not text:
return

# Replace the last element with this update
recognizer_buffer[-1] = text

# If the recognition should be stored, add it to the call history
nonlocal recognizer_store_next_recognition
if recognizer_store_next_recognition:
recognizer_store_next_recognition = False
logger.info("Voice stored: %s", recognizer_buffer)
call.messages.append(
MessageModel(
content=" ".join(recognizer_buffer),
persona=MessagePersonaEnum.HUMAN,
)
)

# Add a new element to the buffer, so the next partial recognition starts in a new element
recognizer_buffer.append("")
logger.debug("Complete recognition: %s", recognizer_buffer)
@@ -155,14 +177,10 @@ async def _response_callback() -> None:
if not recognizer_buffer or recognizer_buffer[-1] == "":
return

# Add recognition to the call history
logger.info("Voice recognition: %s", recognizer_buffer)
call.messages.append(
MessageModel(
content=" ".join(recognizer_buffer),
persona=MessagePersonaEnum.HUMAN,
)
)
# Set recognition to be added to the call history
logger.info("Voice recognized: %s", recognizer_buffer)
nonlocal recognizer_store_next_recognition
recognizer_store_next_recognition = True

# Schedule the answer generation
nonlocal last_response
Expand All @@ -172,6 +190,7 @@ async def _response_callback() -> None:
client=automation_client,
post_callback=post_callback,
scheduler=scheduler,
text=" ".join(recognizer_buffer),
training_callback=training_callback,
)
)
@@ -197,6 +216,7 @@ async def _out_answer( # noqa: PLR0915
client: CallAutomationClient,
post_callback: Callable[[CallStateModel], Awaitable[None]],
scheduler: aiojobs.Scheduler,
text: str,
training_callback: Callable[[CallStateModel], Awaitable[None]],
_iterations_remaining: int = 3,
) -> CallStateModel:
@@ -241,8 +261,9 @@ async def _tts_callback(text: str, style: MessageStyleEnum) -> None:
call=call,
client=client,
post_callback=post_callback,
use_tools=_iterations_remaining > 0,
text=text,
tts_callback=_tts_callback,
use_tools=_iterations_remaining > 0,
)
)

@@ -342,6 +363,7 @@ def _clear_tasks() -> None:
client=client,
post_callback=post_callback,
scheduler=scheduler,
text=text,
training_callback=training_callback,
_iterations_remaining=_iterations_remaining - 1,
)
@@ -352,6 +374,7 @@
client=client,
post_callback=post_callback,
scheduler=scheduler,
text=text,
training_callback=training_callback,
_iterations_remaining=_iterations_remaining - 1,
) # Recursive chat (e.g., for retry or tools)
@@ -368,6 +391,7 @@ async def _execute_llm_chat( # noqa: PLR0911, PLR0912, PLR0915
call: CallStateModel,
client: CallAutomationClient,
post_callback: Callable[[CallStateModel], Awaitable[None]],
text: str,
tts_callback: Callable[[str, MessageStyleEnum], Awaitable[None]],
use_tools: bool,
) -> tuple[bool, bool, CallStateModel]:
@@ -437,14 +461,23 @@ async def _content_callback(
tools = await plugins.to_openai()
logger.debug("Tools: %s", tools)

# Add user message in a temporary current context
call_copy = call.model_copy()
call_copy.messages.append(
MessageModel(
content=text,
persona=MessagePersonaEnum.HUMAN,
)
)

# Execute LLM inference
maximum_tokens_reached = False
content_buffer_pointer = 0
tool_calls_buffer: dict[int, MessageToolModel] = {}
try:
async for delta in completion_stream(
max_tokens=160, # Enough for ~90% of cases; if insufficient, a retry is triggered (100 tokens ~= 75 words, 20 words ~= 1 sentence, 6 sentences ~= 160 tokens)
messages=call.messages,
messages=call_copy.messages,
system=system,
tools=tools,
):
@@ -554,28 +587,21 @@ async def _in_audio( # noqa: PLR0913
clear_tts_task: asyncio.Task | None = None
flush_task: asyncio.Task | None = None

# Init VAD parameters
rms_threshold = await vad_threshold()
sample_width = bits_per_sample // 8
silence_duration_ms = await vad_silence_timeout_ms()

async def _flush_callback() -> None:
"""
Flush the audio buffer if no audio is detected for a while.
"""
nonlocal clear_tts_task

# Wait for the timeout
await asyncio.sleep(silence_duration_ms / 1000)
await asyncio.sleep(await vad_silence_timeout_ms() / 1000)

# Cancel the TTS clear task if any
if clear_tts_task:
clear_tts_task.cancel()
clear_tts_task = None

logger.debug(
"Timeout triggered after %ims, flushing audio buffer", silence_duration_ms
)
logger.debug("Timeout triggered, flushing audio buffer")

# Commit the buffer
await response_callback()
@@ -586,8 +612,8 @@ async def _clear_tts_callback() -> None:
Start is the index of the buffer where the TTS was triggered.
"""
# Wait 200ms before clearing the TTS queue
await asyncio.sleep(0.2)
# Wait before clearing the TTS queue
await asyncio.sleep(await vad_cutoff_timeout_ms() / 1000)

logger.debug("Voice detected, cancelling TTS")

@@ -604,19 +630,26 @@
channels=channels,
data=in_chunck,
frame_rate=sample_rate,
sample_width=sample_width,
sample_width=bits_per_sample // 8,
)

# Confirm ASAP that the event is processed
in_stream.task_done()

# Apply high-pass and low-pass filters in a simple attempt to reduce noise
in_audio = high_pass_filter(in_audio, 200)
in_audio = low_pass_filter(in_audio, 3000)

# Always add the audio to the buffer
assert isinstance(in_audio.raw_data, bytes)
out_stream.write(in_audio.raw_data)

# Get the relative dB; silence should sit at 1 to 5% of the max, so 0.1 to 0.5 of the threshold
in_empty = False
if min(in_audio.rms / in_audio.max_possible_amplitude * 10, 1) < rms_threshold:
if (
min(in_audio.rms / in_audio.max_possible_amplitude * 10, 1)
< await vad_threshold()
):
in_empty = True
# Start timeout if not already started and VAD already triggered
if not flush_task:
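
The noise filtering and threshold check above can be exercised on their own. Below is a minimal, self-contained sketch of the same logic, assuming only `pydub` is installed; the constant mirrors the `vad_threshold` default, and `is_silence` is an illustrative name, not a function from the repository:

```python
from pydub import AudioSegment
from pydub.effects import high_pass_filter, low_pass_filter
from pydub.generators import Sine

VAD_THRESHOLD = 0.5  # mirrors the `vad_threshold` default

def is_silence(chunk: AudioSegment) -> bool:
    # Band-pass roughly to the voice range, a simple attempt to reduce noise
    chunk = high_pass_filter(chunk, 200)
    chunk = low_pass_filter(chunk, 3000)
    # Relative RMS scaled by 10: silence sits at 1-5% of the max amplitude,
    # which lands at 0.1-0.5 against the threshold
    return min(chunk.rms / chunk.max_possible_amplitude * 10, 1) < VAD_THRESHOLD

# A full-scale 440 Hz tone reads as voice; attenuated by 60 dB it reads as silence.
tone = Sine(440).to_audio_segment(duration=100)  # 100 ms
assert not is_silence(tone)
assert is_silence(tone - 60)
```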
6 changes: 5 additions & 1 deletion app/helpers/features.py
@@ -29,7 +29,11 @@ async def callback_timeout_hour() -> int:


async def vad_silence_timeout_ms() -> int:
return await _get(key="vad_silence_timeout_ms", type_res=int) or 500
return await _get(key="vad_silence_timeout_ms", type_res=int) or 400


async def vad_cutoff_timeout_ms() -> int:
return await _get(key="vad_cutoff_timeout_ms", type_res=int) or 400


async def vad_threshold() -> float:
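
Every flag in `app/helpers/features.py` follows the same getter pattern: read the key from App Configuration, coerce it to the expected type, and fall back to a hard-coded default. As a sketch, a hypothetical new flag would look like this (the name and default are illustrative, not part of this commit):

```python
# Hypothetical flag following the existing pattern. `_get` is the module's
# helper, assumed to return None when the key is unset or fails to parse.
async def recognition_language() -> str:
    return await _get(key="recognition_language", type_res=str) or "en-US"
```

Note the `or` fallback: a configured value that parses to something falsy (such as `0` for an `int` flag) silently falls back to the default, which is worth keeping in mind for the timeout flags above.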
43 changes: 22 additions & 21 deletions cicd/bicep/app.bicep
@@ -565,72 +565,72 @@ resource contentfilter 'Microsoft.CognitiveServices/accounts/raiPolicies@2024-06
}
// Prompt
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'hate'
severityThreshold: 'High'
source: 'Prompt'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'sexual'
severityThreshold: 'High'
source: 'Prompt'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'selfharm'
severityThreshold: 'High'
source: 'Prompt'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'violence'
severityThreshold: 'High'
source: 'Prompt'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'profanity'
severityThreshold: 'High'
source: 'Prompt'
}
// Completion
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'hate'
severityThreshold: 'High'
source: 'Completion'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'sexual'
severityThreshold: 'High'
source: 'Completion'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'selfharm'
severityThreshold: 'High'
source: 'Completion'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'violence'
severityThreshold: 'High'
source: 'Completion'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'profanity'
severityThreshold: 'High'
source: 'Completion'
@@ -905,7 +905,8 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@
recognition_retry_max: 2
recording_enabled: false
slow_llm_for_chat: false
vad_silence_timeout_ms: 500
vad_cutoff_timeout_ms: 400
vad_silence_timeout_ms: 400
vad_threshold: '0.5'
}): {
parent: configStore