From 0497c8d62efe90d380d7379bab8df06354dc2a68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Thu, 5 Dec 2024 09:45:51 +0100 Subject: [PATCH] perf: Enhance VAD --- README.md | 1 - app/helpers/call_llm.py | 24 ++++++++++++++++-------- app/helpers/features.py | 4 ---- cicd/bicep/app.bicep | 1 - pyproject.toml | 1 + uv.lock | 8 ++++++++ 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 2587d093..7fc77929 100644 --- a/README.md +++ b/README.md @@ -632,7 +632,6 @@ Conversation options are represented as features. They can be configured from Ap | `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false | | `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in seconds. | `int` | 400 | | `vad_silence_timeout_ms` | The timeout for phone silence in seconds. | `int` | 400 | -| `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 | ### Use an OpenAI compatible model for the LLM diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py index d597b04a..0bb8e8c0 100644 --- a/app/helpers/call_llm.py +++ b/app/helpers/call_llm.py @@ -13,7 +13,11 @@ from azure.communication.callautomation.aio import CallAutomationClient from openai import APIError from pydub import AudioSegment -from pydub.effects import high_pass_filter, low_pass_filter +from pydub.effects import ( + high_pass_filter, + low_pass_filter, +) +from webrtcvad import Vad from app.helpers.call_utils import ( handle_clear_queue, @@ -27,7 +31,6 @@ answer_soft_timeout_sec, vad_cutoff_timeout_ms, vad_silence_timeout_ms, - vad_threshold, ) from app.helpers.identity import token from app.helpers.llm_tools import DefaultPlugin @@ -586,6 +589,11 @@ async def _in_audio( # noqa: PLR0913 ) -> None: clear_tts_task: asyncio.Task | None = None flush_task: asyncio.Task | None = None + vad = Vad( + # Aggressiveness mode (0, 1, 2, or 3) + # Sets the VAD operating mode. 
A more aggressive (higher mode) VAD is more restrictive in reporting speech. In other words, the probability that a frame really is speech when the VAD returns 1 increases with the mode. As a consequence, the missed detection rate also goes up. + mode=3, + ) async def _flush_callback() -> None: """ @@ -637,18 +645,18 @@ async def _clear_tts_callback() -> None: in_stream.task_done() # Apply high-pass and low-pass filters in a simple attempt to reduce noise - in_audio = high_pass_filter(in_audio, 200) - in_audio = low_pass_filter(in_audio, 3000) + in_audio = high_pass_filter(seg=in_audio, cutoff=85) + in_audio = low_pass_filter(seg=in_audio, cutoff=3000) # Always add the audio to the buffer assert isinstance(in_audio.raw_data, bytes) out_stream.write(in_audio.raw_data) - # Get the relative dB, silences shoudl be at 1 to 5% of the max, so 0.1 to 0.5 of the threshold + # Use WebRTC VAD algorithm to detect voice in_empty = False - if ( - min(in_audio.rms / in_audio.max_possible_amplitude * 10, 1) - < await vad_threshold() + if not vad.is_speech( + buf=in_audio.raw_data, + sample_rate=in_audio.frame_rate, ): in_empty = True # Start timeout if not already started and VAD already triggered diff --git a/app/helpers/features.py b/app/helpers/features.py index 7f6ceaa0..e029657d 100644 --- a/app/helpers/features.py +++ b/app/helpers/features.py @@ -36,10 +36,6 @@ async def vad_cutoff_timeout_ms() -> int: return await _get(key="vad_cutoff_timeout_ms", type_res=int) or 400 -async def vad_threshold() -> float: - return await _get(key="vad_threshold", type_res=float) or 0.5 - - async def recording_enabled() -> bool: return await _get(key="recording_enabled", type_res=bool) or False diff --git a/cicd/bicep/app.bicep b/cicd/bicep/app.bicep index f165425f..882666f9 100644 --- a/cicd/bicep/app.bicep +++ b/cicd/bicep/app.bicep @@ -907,7 +907,6 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@ slow_llm_for_chat: false vad_cutoff_timeout_ms: 400 
vad_silence_timeout_ms: 400 - vad_threshold: '0.5' }): { parent: configStore name: item.key diff --git a/pyproject.toml b/pyproject.toml index 4d4f601d..199f01cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ "twilio~=9.3", # Twilio SDK, used for SMS "typing-extensions~=4.12", # Typing extensions for Python 3.6+ "uvicorn[standard]~=0.32", # Application middleware + "webrtcvad~=2.0", # Voice activity detection ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index c4fce423..6b49362d 100644 --- a/uv.lock +++ b/uv.lock @@ -507,6 +507,7 @@ dependencies = [ { name = "twilio" }, { name = "typing-extensions" }, { name = "uvicorn", extra = ["standard"] }, + { name = "webrtcvad" }, ] [package.optional-dependencies] @@ -578,6 +579,7 @@ requires-dist = [ { name = "twilio", specifier = "~=9.3" }, { name = "typing-extensions", specifier = "~=4.12" }, { name = "uvicorn", extras = ["standard"], specifier = "~=0.32" }, + { name = "webrtcvad", specifier = "~=2.0" }, ] [[package]] @@ -3018,6 +3020,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774 }, ] +[[package]] +name = "webrtcvad" +version = "2.0.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/89/34/e2de2d97f3288512b9ea56f92e7452f8207eb5a0096500badf9dfd48f5e6/webrtcvad-2.0.10.tar.gz", hash = "sha256:f1bed2fb25b63fb7b1a55d64090c993c9c9167b28485ae0bcdd81cf6ede96aea", size = 66156 } + [[package]] name = "websockets" version = "14.1"