diff --git a/README.md b/README.md
index dfd44d7..38e14a8 100644
--- a/README.md
+++ b/README.md
@@ -498,8 +498,8 @@ Conversation options are represented as features. They can be configured from Ap
 | `recognition_retry_max` | The maximum number of retries for voice recognition. | `int` | 2 |
 | `recording_enabled` | Whether call recording is enabled. | `bool` | false |
 | `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false |
-| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in secs. | `int` | 600 |
-| `vad_silence_timeout_ms` | The timeout for phone silence in secs. | `int` | 400 |
+| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in ms. | `int` | 300 |
+| `vad_silence_timeout_ms` | The timeout for phone silence in ms. | `int` | 500 |
 | `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 |
 
 ### Use an OpenAI compatible model for the LLM
diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py
index 44b5489..00ae850 100644
--- a/app/helpers/call_llm.py
+++ b/app/helpers/call_llm.py
@@ -167,16 +167,23 @@ async def _stop_callback() -> None:
         stt_buffer.clear()
         stt_complete_gate.clear()
 
+        # Clear the audio buffer
+        while not audio_out.empty():
+            audio_out.get_nowait()
+            audio_out.task_done()
+
         # Send a stop signal
         await audio_out.put(False)
 
-    async def _commit_answer(tool_blacklist: set[str] | None = None) -> None:
+    async def _commit_answer(
+        wait: bool,
+        tool_blacklist: set[str] | None = None,
+    ) -> None:
         """
         Process the response.
-        """
-        # Stop any previous response
-        await _stop_callback()
+        Start the chat task and wait for its response if needed. The job is stored in the `last_response` shared variable.
+        """
         # Start chat task
         nonlocal last_response
         last_response = await scheduler.spawn(
@@ -192,7 +199,8 @@ async def _commit_answer(tool_blacklist: set[str] | None = None) -> None:
         )
 
         # Wait for its response
-        await last_response.wait()
+        if wait:
+            await last_response.wait()
 
     async def _response_callback(_retry: bool = False) -> None:
         """
@@ -204,7 +212,7 @@ async def _response_callback(_retry: bool = False) -> None:
         try:
             await asyncio.wait_for(stt_complete_gate.wait(), timeout=0.05)
         except TimeoutError:
-            pass
+            logger.debug("Complete recognition timeout, using partial recognition")
 
         stt_text = " ".join(stt_buffer).strip()
 
@@ -217,8 +225,11 @@ async def _response_callback(_retry: bool = False) -> None:
             await asyncio.sleep(0.2)
             return await _response_callback(_retry=True)
 
+        # Stop any previous response
+        await _stop_callback()
+
         # Add it to the call history and update last interaction
-        logger.info("Voice stored: %s", stt_buffer)
+        logger.info("Voice stored: %s", stt_text)
         async with _db.call_transac(
             call=call,
             scheduler=scheduler,
@@ -231,11 +242,8 @@ async def _response_callback(_retry: bool = False) -> None:
                 )
             )
 
-        # Clear the recognition buffer
-        stt_buffer.clear()
-
         # Process the response
-        await _commit_answer()
+        await _commit_answer(wait=True)
 
     # First call
     if len(call.messages) <= 1:
@@ -250,7 +258,8 @@ async def _response_callback(_retry: bool = False) -> None:
         else:
             # Welcome with the LLM, do not use the end call tool for the first message, LLM hallucinates it and this is extremely frustrating for the user
             await _commit_answer(
-                {"end_call"},
+                tool_blacklist={"end_call"},
+                wait=False,
             )
 
     await asyncio.gather(
@@ -720,9 +729,8 @@ async def _wait_for_stop() -> None:
         # Wait before clearing the TTS queue
         await asyncio.sleep(timeout_ms / 1000)
 
-        logger.debug("Canceling TTS after %i ms", timeout_ms)
 
-        # Clear the queue
+        logger.info("Stopping TTS after %i ms", timeout_ms)
         await stop_callback()
 
         while True:
diff --git a/app/helpers/call_utils.py b/app/helpers/call_utils.py
index 67e0533..1076706 100644
--- a/app/helpers/call_utils.py
+++ b/app/helpers/call_utils.py
@@ -84,14 +84,6 @@ def write(self, audio_buffer: memoryview) -> int:
         self.queue.put_nowait(audio_buffer.tobytes())
         return audio_buffer.nbytes
 
-    def close(self) -> None:
-        """
-        Close the callback.
-        """
-        while not self.queue.empty():
-            self.queue.get_nowait()
-            self.queue.task_done()
-
 
 class ContextEnum(str, Enum):
     """
@@ -735,16 +727,14 @@ async def _process_one(self, input_pcm: bytes) -> None:
         # Apply noise reduction
         reduced_signal = reduce_noise(
             # Input signal
-            clip_noise_stationary=False,
             sr=self._sample_rate,
             y=input_signal,
-            # Performance
-            n_fft=self._chunk_size,
+            # Quality
+            n_fft=128,
             # Since the reference signal is already noise-reduced, we can assume it's stationary
+            clip_noise_stationary=False,  # Noise is longer than the signal
             stationary=True,
             y_noise=self._bot_voice_buffer,
-            # Output quality
-            prop_decrease=0.75,  # Reduce noise by 75%
         )
 
         # Perform VAD test
diff --git a/app/helpers/features.py b/app/helpers/features.py
index 392f773..2df8fa1 100644
--- a/app/helpers/features.py
+++ b/app/helpers/features.py
@@ -59,7 +59,7 @@ async def vad_threshold() -> float:
 
 async def vad_silence_timeout_ms() -> int:
     return await _default(
-        default=400,
+        default=500,
         key="vad_silence_timeout_ms",
         type_res=int,
     )
@@ -67,7 +67,7 @@ async def vad_silence_timeout_ms() -> int:
 
 async def vad_cutoff_timeout_ms() -> int:
     return await _default(
-        default=600,
+        default=300,
        key="vad_cutoff_timeout_ms",
         type_res=int,
     )
diff --git a/app/main.py b/app/main.py
index 0ffdaf4..18d7e94 100644
--- a/app/main.py
+++ b/app/main.py
@@ -625,6 +625,7 @@ async def _send_audio() -> None:
                 )
             # Stop audio
             elif audio_data is False:
+                logger.debug("Stop audio event received, stopping audio")
                 await websocket.send_json(
                     {
                         "kind": "StopAudio",
diff --git a/cicd/bicep/app.bicep b/cicd/bicep/app.bicep
index 16e60eb..f7ffafe 100644
--- a/cicd/bicep/app.bicep
+++ b/cicd/bicep/app.bicep
@@ -907,8 +907,8 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@
     recognition_retry_max: 2
     recording_enabled: false
     slow_llm_for_chat: false
-    vad_cutoff_timeout_ms: 600
-    vad_silence_timeout_ms: 400
+    vad_cutoff_timeout_ms: 300
+    vad_silence_timeout_ms: 500
     vad_threshold: '0.5'
   }): {
     parent: configStore