Merge branch 'develop'

microsoft · Dec 11, 2024 · 69cb05d · 69cb05d
2 parents 9814796 + e3f72d5
commit 69cb05d
Show file tree

Hide file tree

Showing 6 changed files with 32 additions and 33 deletions.
diff --git a/README.md b/README.md
@@ -498,8 +498,8 @@ Conversation options are represented as features. They can be configured from Ap
 | `recognition_retry_max` | The maximum number of retries for voice recognition. | `int` | 2 |
 | `recording_enabled` | Whether call recording is enabled. | `bool` | false |
 | `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false |
-| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in secs. | `int` | 600 |
-| `vad_silence_timeout_ms` | The timeout for phone silence in secs. | `int` | 400 |
+| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection, in milliseconds. | `int` | 300 |
+| `vad_silence_timeout_ms` | The timeout for phone silence, in milliseconds. | `int` | 500 |
 | `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 |
 
 ### Use an OpenAI compatible model for the LLM

diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py
@@ -167,16 +167,23 @@ async def _stop_callback() -> None:
             stt_buffer.clear()
             stt_complete_gate.clear()
 
+            # Clear the audio buffer
+            while not audio_out.empty():
+                audio_out.get_nowait()
+                audio_out.task_done()
+
             # Send a stop signal
             await audio_out.put(False)
 
-        async def _commit_answer(tool_blacklist: set[str] | None = None) -> None:
+        async def _commit_answer(
+            wait: bool,
+            tool_blacklist: set[str] | None = None,
+        ) -> None:
             """
             Process the response.
-            """
-            # Stop any previous response
-            await _stop_callback()
 
+            Start the chat task and wait for its response if needed. Job is stored in `last_response` shared variable.
+            """
             # Start chat task
             nonlocal last_response
             last_response = await scheduler.spawn(
@@ -192,7 +199,8 @@ async def _commit_answer(tool_blacklist: set[str] | None = None) -> None:
             )
 
             # Wait for its response
-            await last_response.wait()
+            if wait:
+                await last_response.wait()
 
         async def _response_callback(_retry: bool = False) -> None:
             """
@@ -204,7 +212,7 @@ async def _response_callback(_retry: bool = False) -> None:
             try:
                 await asyncio.wait_for(stt_complete_gate.wait(), timeout=0.05)
             except TimeoutError:
-                pass
+                logger.debug("Complete recognition timeout, using partial recognition")
 
             stt_text = " ".join(stt_buffer).strip()
 
@@ -217,8 +225,11 @@ async def _response_callback(_retry: bool = False) -> None:
                 await asyncio.sleep(0.2)
                 return await _response_callback(_retry=True)
 
+            # Stop any previous response
+            await _stop_callback()
+
             # Add it to the call history and update last interaction
-            logger.info("Voice stored: %s", stt_buffer)
+            logger.info("Voice stored: %s", stt_text)
             async with _db.call_transac(
                 call=call,
                 scheduler=scheduler,
@@ -231,11 +242,8 @@ async def _response_callback(_retry: bool = False) -> None:
                     )
                 )
 
-            # Clear the recognition buffer
-            stt_buffer.clear()
-
             # Process the response
-            await _commit_answer()
+            await _commit_answer(wait=True)
 
         # First call
         if len(call.messages) <= 1:
@@ -250,7 +258,8 @@ async def _response_callback(_retry: bool = False) -> None:
         else:
             # Welcome with the LLM, do not use the end call tool for the first message, LLM hallucinates it and this is extremely frustrating for the user
             await _commit_answer(
-                {"end_call"},
+                tool_blacklist={"end_call"},
+                wait=False,
             )
 
         await asyncio.gather(
@@ -720,9 +729,8 @@ async def _wait_for_stop() -> None:
         # Wait before clearing the TTS queue
         await asyncio.sleep(timeout_ms / 1000)
 
-        logger.debug("Canceling TTS after %i ms", timeout_ms)
-
         # Clear the queue
+        logger.info("Stoping TTS after %i ms", timeout_ms)
         await stop_callback()
 
     while True:

diff --git a/app/helpers/call_utils.py b/app/helpers/call_utils.py
@@ -84,14 +84,6 @@ def write(self, audio_buffer: memoryview) -> int:
         self.queue.put_nowait(audio_buffer.tobytes())
         return audio_buffer.nbytes
 
-    def close(self) -> None:
-        """
-        Close the callback.
-        """
-        while not self.queue.empty():
-            self.queue.get_nowait()
-            self.queue.task_done()
-
 
 class ContextEnum(str, Enum):
     """
@@ -735,16 +727,14 @@ async def _process_one(self, input_pcm: bytes) -> None:
         # Apply noise reduction
         reduced_signal = reduce_noise(
             # Input signal
-            clip_noise_stationary=False,
             sr=self._sample_rate,
             y=input_signal,
-            # Performance
-            n_fft=self._chunk_size,
+            # Quality
+            n_fft=128,
             # Since the reference signal is already noise-reduced, we can assume it's stationary
+            clip_noise_stationary=False,  # Noise is longer than the signal
             stationary=True,
             y_noise=self._bot_voice_buffer,
-            # Output quality
-            prop_decrease=0.75,  # Reduce noise by 75%
         )
 
         # Perform VAD test

diff --git a/app/helpers/features.py b/app/helpers/features.py
@@ -59,15 +59,15 @@ async def vad_threshold() -> float:
 
 async def vad_silence_timeout_ms() -> int:
     return await _default(
-        default=400,
+        default=500,
         key="vad_silence_timeout_ms",
         type_res=int,
     )
 
 
 async def vad_cutoff_timeout_ms() -> int:
     return await _default(
-        default=600,
+        default=300,
         key="vad_cutoff_timeout_ms",
         type_res=int,
     )

diff --git a/app/main.py b/app/main.py
@@ -625,6 +625,7 @@ async def _send_audio() -> None:
                     )
                 # Stop audio
                 elif audio_data is False:
+                    logger.debug("Stop audio event received, stopping audio")
                     await websocket.send_json(
                         {
                             "kind": "StopAudio",

diff --git a/cicd/bicep/app.bicep b/cicd/bicep/app.bicep
@@ -907,8 +907,8 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@
     recognition_retry_max: 2
     recording_enabled: false
     slow_llm_for_chat: false
-    vad_cutoff_timeout_ms: 600
-    vad_silence_timeout_ms: 400
+    vad_cutoff_timeout_ms: 300
+    vad_silence_timeout_ms: 500
     vad_threshold: '0.5'
   }): {
     parent: configStore