From 0497c8d62efe90d380d7379bab8df06354dc2a68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= <clemence@lesne.pro>
Date: Thu, 5 Dec 2024 09:45:51 +0100
Subject: [PATCH] perf: Enhance VAD

---
 README.md               |  1 -
 app/helpers/call_llm.py | 24 ++++++++++++++++--------
 app/helpers/features.py |  4 ----
 cicd/bicep/app.bicep    |  1 -
 pyproject.toml          |  1 +
 uv.lock                 |  8 ++++++++
 6 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 2587d093..7fc77929 100644
--- a/README.md
+++ b/README.md
@@ -632,7 +632,6 @@ Conversation options are represented as features. They can be configured from Ap
 | `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false |
 | `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in seconds. | `int` | 400 |
 | `vad_silence_timeout_ms` | The timeout for phone silence in seconds. | `int` | 400 |
-| `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 |
 
 ### Use an OpenAI compatible model for the LLM
 
diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py
index d597b04a..0bb8e8c0 100644
--- a/app/helpers/call_llm.py
+++ b/app/helpers/call_llm.py
@@ -13,7 +13,11 @@
 from azure.communication.callautomation.aio import CallAutomationClient
 from openai import APIError
 from pydub import AudioSegment
-from pydub.effects import high_pass_filter, low_pass_filter
+from pydub.effects import (
+    high_pass_filter,
+    low_pass_filter,
+)
+from webrtcvad import Vad
 
 from app.helpers.call_utils import (
     handle_clear_queue,
@@ -27,7 +31,6 @@
     answer_soft_timeout_sec,
     vad_cutoff_timeout_ms,
     vad_silence_timeout_ms,
-    vad_threshold,
 )
 from app.helpers.identity import token
 from app.helpers.llm_tools import DefaultPlugin
@@ -586,6 +589,11 @@ async def _in_audio(  # noqa: PLR0913
 ) -> None:
     clear_tts_task: asyncio.Task | None = None
     flush_task: asyncio.Task | None = None
+    vad = Vad(
+        # Aggressiveness mode (0, 1, 2, or 3)
+        # Sets the VAD operating mode. A more aggressive (higher mode) VAD is more restrictive in reporting speech. In other words, the probability that the audio really is speech when the VAD returns 1 increases with the mode; as a consequence, the missed detection rate also goes up.
+        mode=3,
+    )
 
     async def _flush_callback() -> None:
         """
@@ -637,18 +645,18 @@ async def _clear_tts_callback() -> None:
         in_stream.task_done()
 
         # Apply high-pass and low-pass filters in a simple attempt to reduce noise
-        in_audio = high_pass_filter(in_audio, 200)
-        in_audio = low_pass_filter(in_audio, 3000)
+        in_audio = high_pass_filter(seg=in_audio, cutoff=85)
+        in_audio = low_pass_filter(seg=in_audio, cutoff=3000)
 
         # Always add the audio to the buffer
         assert isinstance(in_audio.raw_data, bytes)
         out_stream.write(in_audio.raw_data)
 
-        # Get the relative dB, silences shoudl be at 1 to 5% of the max, so 0.1 to 0.5 of the threshold
+        # Use WebRTC VAD algorithm to detect voice
         in_empty = False
-        if (
-            min(in_audio.rms / in_audio.max_possible_amplitude * 10, 1)
-            < await vad_threshold()
+        if not vad.is_speech(
+            buf=in_audio.raw_data,
+            sample_rate=in_audio.frame_rate,
         ):
             in_empty = True
             # Start timeout if not already started and VAD already triggered
diff --git a/app/helpers/features.py b/app/helpers/features.py
index 7f6ceaa0..e029657d 100644
--- a/app/helpers/features.py
+++ b/app/helpers/features.py
@@ -36,10 +36,6 @@ async def vad_cutoff_timeout_ms() -> int:
     return await _get(key="vad_cutoff_timeout_ms", type_res=int) or 400
 
 
-async def vad_threshold() -> float:
-    return await _get(key="vad_threshold", type_res=float) or 0.5
-
-
 async def recording_enabled() -> bool:
     return await _get(key="recording_enabled", type_res=bool) or False
 
diff --git a/cicd/bicep/app.bicep b/cicd/bicep/app.bicep
index f165425f..882666f9 100644
--- a/cicd/bicep/app.bicep
+++ b/cicd/bicep/app.bicep
@@ -907,7 +907,6 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@
     slow_llm_for_chat: false
     vad_cutoff_timeout_ms: 400
     vad_silence_timeout_ms: 400
-    vad_threshold: '0.5'
   }): {
     parent: configStore
     name: item.key
diff --git a/pyproject.toml b/pyproject.toml
index 4d4f601d..199f01cf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
   "twilio~=9.3",  # Twilio SDK, used for SMS
   "typing-extensions~=4.12",  # Typing extensions for Python 3.6+
   "uvicorn[standard]~=0.32",  # Application middleware
+  "webrtcvad~=2.0",  # Voice activity detection
 ]
 
 [project.optional-dependencies]
diff --git a/uv.lock b/uv.lock
index c4fce423..6b49362d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -507,6 +507,7 @@ dependencies = [
     { name = "twilio" },
     { name = "typing-extensions" },
     { name = "uvicorn", extra = ["standard"] },
+    { name = "webrtcvad" },
 ]
 
 [package.optional-dependencies]
@@ -578,6 +579,7 @@ requires-dist = [
     { name = "twilio", specifier = "~=9.3" },
     { name = "typing-extensions", specifier = "~=4.12" },
     { name = "uvicorn", extras = ["standard"], specifier = "~=0.32" },
+    { name = "webrtcvad", specifier = "~=2.0" },
 ]
 
 [[package]]
@@ -3018,6 +3020,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774 },
 ]
 
+[[package]]
+name = "webrtcvad"
+version = "2.0.10"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/89/34/e2de2d97f3288512b9ea56f92e7452f8207eb5a0096500badf9dfd48f5e6/webrtcvad-2.0.10.tar.gz", hash = "sha256:f1bed2fb25b63fb7b1a55d64090c993c9c9167b28485ae0bcdd81cf6ede96aea", size = 66156 }
+
 [[package]]
 name = "websockets"
 version = "14.1"