From 19eb8ce900c9cb664af817ece7cc1593cb05d00c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 17:15:50 +0100 Subject: [PATCH 01/12] chore: Delete dead code --- app/helpers/call_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/app/helpers/call_utils.py b/app/helpers/call_utils.py index 67e0533..8d12c20 100644 --- a/app/helpers/call_utils.py +++ b/app/helpers/call_utils.py @@ -84,14 +84,6 @@ def write(self, audio_buffer: memoryview) -> int: self.queue.put_nowait(audio_buffer.tobytes()) return audio_buffer.nbytes - def close(self) -> None: - """ - Close the callback. - """ - while not self.queue.empty(): - self.queue.get_nowait() - self.queue.task_done() - class ContextEnum(str, Enum): """ From f27e24a5e549b3dbcec32c7a33c4f22ee85c335d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 17:16:14 +0100 Subject: [PATCH 02/12] fix: Assistant stop when user speak --- app/helpers/call_llm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py index 44b5489..ff77234 100644 --- a/app/helpers/call_llm.py +++ b/app/helpers/call_llm.py @@ -167,6 +167,11 @@ async def _stop_callback() -> None: stt_buffer.clear() stt_complete_gate.clear() + # Clear the audio buffer + while not audio_out.empty(): + audio_out.get_nowait() + audio_out.task_done() + # Send a stop signal await audio_out.put(False) From c252d9380e1c78c298eee59eacb7ea5b54fcbad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 17:18:07 +0100 Subject: [PATCH 03/12] fix: Cut voice in the first seconds --- app/helpers/call_llm.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py index ff77234..5cf08ea 100644 --- a/app/helpers/call_llm.py +++ b/app/helpers/call_llm.py @@ -175,9 +175,14 @@ async def _stop_callback() -> None: # Send a stop signal await audio_out.put(False) - async def _commit_answer(tool_blacklist: set[str] | None = None) -> None: + async def _commit_answer( + wait: bool, + tool_blacklist: set[str] | None = None, + ) -> None: """ Process the response. + + Start the chat task and wait for its response if needed. Job is stored in `last_response` shared variable. """ # Stop any previous response await _stop_callback() @@ -197,7 +202,8 @@ async def _commit_answer(tool_blacklist: set[str] | None = None) -> None: ) # Wait for its response - await last_response.wait() + if wait: + await last_response.wait() async def _response_callback(_retry: bool = False) -> None: """ @@ -240,7 +246,7 @@ async def _response_callback(_retry: bool = False) -> None: stt_buffer.clear() # Process the response - await _commit_answer() + await _commit_answer(wait=True) # First call if len(call.messages) <= 1: @@ -255,7 +261,8 @@ async def _response_callback(_retry: bool = False) -> None: else: # Welcome with the LLM, do not use the end call tool for the first message, LLM hallucinates it and this is extremely frustrating for the user await _commit_answer( - {"end_call"}, + tool_blacklist={"end_call"}, + wait=False, ) await asyncio.gather( From bcf0a529c5e24a09c9b9246b7313bffbf65f5cfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 17:18:30 +0100 Subject: [PATCH 04/12] perf: Lower latency for cutting voice --- app/helpers/call_llm.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py index 5cf08ea..40d3df3 100644 --- a/app/helpers/call_llm.py +++ b/app/helpers/call_llm.py @@ -184,9 +184,6 @@ async def _commit_answer( Start the chat task and wait for its response if needed. Job is stored in `last_response` shared variable. """ - # Stop any previous response - await _stop_callback() - # Start chat task nonlocal last_response last_response = await scheduler.spawn( @@ -228,6 +225,9 @@ async def _response_callback(_retry: bool = False) -> None: await asyncio.sleep(0.2) return await _response_callback(_retry=True) + # Stop any previous response + await _stop_callback() + # Add it to the call history and update last interaction logger.info("Voice stored: %s", stt_buffer) async with _db.call_transac( @@ -242,9 +242,6 @@ async def _response_callback(_retry: bool = False) -> None: ) ) - # Clear the recognition buffer - stt_buffer.clear() - # Process the response await _commit_answer(wait=True) From f474234f75a743de28ab8b478705439eb6696929 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 17:18:40 +0100 Subject: [PATCH 05/12] dev: Enhance logging --- app/helpers/call_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py index 40d3df3..ad74678 100644 --- a/app/helpers/call_llm.py +++ b/app/helpers/call_llm.py @@ -229,7 +229,7 @@ async def _response_callback(_retry: bool = False) -> None: await _stop_callback() # Add it to the call history and update last interaction - logger.info("Voice stored: %s", stt_buffer) + logger.info("Voice stored: %s", stt_text) async with _db.call_transac( call=call, scheduler=scheduler, From fc6ac38c946b60c0261978a7ae53426e3214204d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 17:18:57 +0100 Subject: [PATCH 06/12] dev: Display log when voice is cutted --- app/helpers/call_llm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py index ad74678..147ae43 100644 --- a/app/helpers/call_llm.py +++ b/app/helpers/call_llm.py @@ -729,9 +729,8 @@ async def _wait_for_stop() -> None: # Wait before clearing the TTS queue await asyncio.sleep(timeout_ms / 1000) - logger.debug("Canceling TTS after %i ms", timeout_ms) - # Clear the queue + logger.info("Stoping TTS after %i ms", timeout_ms) await stop_callback() while True: From df6590baf2c9b1d5994e0a6e19c1a34b74e4d784 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 17:19:12 +0100 Subject: [PATCH 07/12] dev: Display debug log when audio is stopped --- app/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/app/main.py b/app/main.py index 0ffdaf4..18d7e94 100644 --- a/app/main.py +++ b/app/main.py @@ -625,6 +625,7 @@ async def _send_audio() -> None: ) # Stop audio elif audio_data is False: + logger.debug("Stop audio event received, stopping audio") await websocket.send_json( { "kind": "StopAudio", From 2b893169c29f6b0443d8a72f9be50627b3dfbf9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 18:36:45 +0100 Subject: [PATCH 08/12] ux: Fine-tune voice cut delay --- README.md | 4 ++-- app/helpers/features.py | 4 ++-- cicd/bicep/app.bicep | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index dfd44d7..38e14a8 100644 --- a/README.md +++ b/README.md @@ -498,8 +498,8 @@ Conversation options are represented as features. They can be configured from Ap | `recognition_retry_max` | The maximum number of retries for voice recognition. | `int` | 2 | | `recording_enabled` | Whether call recording is enabled. | `bool` | false | | `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false | -| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in secs. | `int` | 600 | -| `vad_silence_timeout_ms` | The timeout for phone silence in secs. | `int` | 400 | +| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in secs. | `int` | 300 | +| `vad_silence_timeout_ms` | The timeout for phone silence in secs. | `int` | 500 | | `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 | ### Use an OpenAI compatible model for the LLM diff --git a/app/helpers/features.py b/app/helpers/features.py index 392f773..2df8fa1 100644 --- a/app/helpers/features.py +++ b/app/helpers/features.py @@ -59,7 +59,7 @@ async def vad_threshold() -> float: async def vad_silence_timeout_ms() -> int: return await _default( - default=400, + default=500, key="vad_silence_timeout_ms", type_res=int, ) @@ -67,7 +67,7 @@ async def vad_silence_timeout_ms() -> int: async def vad_cutoff_timeout_ms() -> int: return await _default( - default=600, + default=300, key="vad_cutoff_timeout_ms", type_res=int, ) diff --git a/cicd/bicep/app.bicep b/cicd/bicep/app.bicep index 16e60eb..f7ffafe 100644 --- a/cicd/bicep/app.bicep +++ b/cicd/bicep/app.bicep @@ -907,8 +907,8 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@ recognition_retry_max: 2 recording_enabled: false slow_llm_for_chat: false - vad_cutoff_timeout_ms: 600 - vad_silence_timeout_ms: 400 + vad_cutoff_timeout_ms: 300 + vad_silence_timeout_ms: 500 vad_threshold: '0.5' }): { parent: configStore From 843fe28efbf95634589e67a46e73581c40c4b3ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 18:37:18 +0100 Subject: [PATCH 09/12] dev: Add log for recognition timeout --- app/helpers/call_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py index 147ae43..00ae850 100644 --- a/app/helpers/call_llm.py +++ b/app/helpers/call_llm.py @@ -212,7 +212,7 @@ async def _response_callback(_retry: bool = False) -> None: try: await asyncio.wait_for(stt_complete_gate.wait(), timeout=0.05) except TimeoutError: - pass + logger.debug("Complete recognition timeout, using partial recognition") stt_text = " ".join(stt_buffer).strip() From f2281b965d773ac5f4564081740049bfe47c9842 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 18:37:33 +0100 Subject: [PATCH 10/12] quality: Code quality --- app/helpers/call_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/helpers/call_utils.py b/app/helpers/call_utils.py index 8d12c20..f09627b 100644 --- a/app/helpers/call_utils.py +++ b/app/helpers/call_utils.py @@ -727,12 +727,12 @@ async def _process_one(self, input_pcm: bytes) -> None: # Apply noise reduction reduced_signal = reduce_noise( # Input signal - clip_noise_stationary=False, sr=self._sample_rate, y=input_signal, # Performance n_fft=self._chunk_size, # Since the reference signal is already noise-reduced, we can assume it's stationary + clip_noise_stationary=False, # Noise is longer than the signal stationary=True, y_noise=self._bot_voice_buffer, # Output quality From 37de648f58c5e28149f2588a1a83d7afc0835b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 18:38:03 +0100 Subject: [PATCH 11/12] perf: Limit echo in the final audio 100% of background removal seems fine. --- app/helpers/call_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/app/helpers/call_utils.py b/app/helpers/call_utils.py index f09627b..499a18b 100644 --- a/app/helpers/call_utils.py +++ b/app/helpers/call_utils.py @@ -735,8 +735,6 @@ async def _process_one(self, input_pcm: bytes) -> None: clip_noise_stationary=False, # Noise is longer than the signal stationary=True, y_noise=self._bot_voice_buffer, - # Output quality - prop_decrease=0.75, # Reduce noise by 75% ) # Perform VAD test From e3f72d56e6bdf0fe8a19d93f11da18ce585d2e70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Wed, 11 Dec 2024 18:38:19 +0100 Subject: [PATCH 12/12] perf: Enhance echo detection --- app/helpers/call_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/helpers/call_utils.py b/app/helpers/call_utils.py index 499a18b..1076706 100644 --- a/app/helpers/call_utils.py +++ b/app/helpers/call_utils.py @@ -729,8 +729,8 @@ async def _process_one(self, input_pcm: bytes) -> None: # Input signal sr=self._sample_rate, y=input_signal, - # Performance - n_fft=self._chunk_size, + # Quality + n_fft=128, # Since the reference signal is already noise-reduced, we can assume it's stationary clip_noise_stationary=False, # Noise is longer than the signal stationary=True,