Merge branch 'develop'
clemlesne committed Nov 26, 2024
2 parents 5ca7ac2 + 420f2e2 commit a284bb2
Showing 5 changed files with 115 additions and 65 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -637,7 +637,8 @@ Conversation options are represented as features. They can be configured from App Configuration.
| `recognition_retry_max` | The maximum number of retries for voice recognition. | `int` | 2 |
| `recording_enabled` | Whether call recording is enabled. | `bool` | false |
| `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false |
| `vad_silence_timeout_ms` | The timeout for phone silence in milliseconds. | `int` | 500 |
| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in milliseconds. | `int` | 400 |
| `vad_silence_timeout_ms` | The timeout for phone silence in milliseconds. | `int` | 400 |
| `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 |
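
Feature values can be updated at runtime from the App Configuration store. As a hedged illustration, the sketch below sets `vad_silence_timeout_ms` with the `azure-appconfiguration` Python SDK; the connection string is a placeholder and this snippet is not part of the repository:

```python
# Illustrative only: update a conversation feature at runtime. The connection
# string below is a placeholder, not a value from this repository.
from azure.appconfiguration import (
    AzureAppConfigurationClient,
    ConfigurationSetting,
)

client = AzureAppConfigurationClient.from_connection_string("<connection-string>")
client.set_configuration_setting(
    ConfigurationSetting(
        key="vad_silence_timeout_ms",
        value="400",  # stored as a string, parsed as `int` by the app
    )
)
```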

### Use an OpenAI-compatible model for the LLM
79 changes: 56 additions & 23 deletions app/helpers/call_llm.py
@@ -13,6 +13,7 @@
from azure.communication.callautomation.aio import CallAutomationClient
from openai import APIError
from pydub import AudioSegment
from pydub.effects import high_pass_filter, low_pass_filter

from app.helpers.call_utils import (
handle_clear_queue,
@@ -24,6 +25,7 @@
from app.helpers.features import (
answer_hard_timeout_sec,
answer_soft_timeout_sec,
vad_cutoff_timeout_ms,
vad_silence_timeout_ms,
vad_threshold,
)
@@ -65,6 +67,7 @@ async def load_llm_chat( # noqa: PLR0913
# Init language recognition
speech_token = await (await token("https://cognitiveservices.azure.com/.default"))()
recognizer_buffer: list[str] = []
recognizer_store_next_recognition = False
recognizer_lock = asyncio.Event()
recognizer_stream = PushAudioInputStream(
stream_format=AudioStreamFormat(
@@ -86,25 +89,44 @@

def _handle_partial_recognition(event: SpeechRecognitionEventArgs) -> None:
text = event.result.text

# Skip if no text
if not text:
return

# Init buffer if empty
if not recognizer_buffer:
recognizer_buffer.append("")

# Replace the last element with this update
recognizer_buffer[-1] = text
logger.debug("Partial recognition: %s", recognizer_buffer)

# Lock the recognition until the audio stream is ready
recognizer_lock.set()

def _handle_complete_recognition(event: SpeechRecognitionEventArgs) -> None:
text = event.result.text

# Skip if no text
if not text:
return

# Replace the last element with this update
recognizer_buffer[-1] = text

# If the recognition should be stored, add it to the call history
nonlocal recognizer_store_next_recognition
if recognizer_store_next_recognition:
recognizer_store_next_recognition = False
logger.info("Voice stored: %s", recognizer_buffer)
call.messages.append(
MessageModel(
content=" ".join(recognizer_buffer),
persona=MessagePersonaEnum.HUMAN,
)
)

# Add a new element to the buffer, so the next partial recognition starts in a new element
recognizer_buffer.append("")
logger.debug("Complete recognition: %s", recognizer_buffer)
@@ -155,14 +177,10 @@ async def _response_callback() -> None:
if not recognizer_buffer or recognizer_buffer[-1] == "":
return

# Add recognition to the call history
logger.info("Voice recognition: %s", recognizer_buffer)
call.messages.append(
MessageModel(
content=" ".join(recognizer_buffer),
persona=MessagePersonaEnum.HUMAN,
)
)
# Set recognition to be added to the call history
logger.info("Voice recognized: %s", recognizer_buffer)
nonlocal recognizer_store_next_recognition
recognizer_store_next_recognition = True

# Schedule the answer generation
nonlocal last_response
Expand All @@ -172,6 +190,7 @@ async def _response_callback() -> None:
client=automation_client,
post_callback=post_callback,
scheduler=scheduler,
text=" ".join(recognizer_buffer),
training_callback=training_callback,
)
)
@@ -197,6 +216,7 @@ async def _out_answer( # noqa: PLR0915
client: CallAutomationClient,
post_callback: Callable[[CallStateModel], Awaitable[None]],
scheduler: aiojobs.Scheduler,
text: str,
training_callback: Callable[[CallStateModel], Awaitable[None]],
_iterations_remaining: int = 3,
) -> CallStateModel:
@@ -241,8 +261,9 @@ async def _tts_callback(text: str, style: MessageStyleEnum) -> None:
call=call,
client=client,
post_callback=post_callback,
use_tools=_iterations_remaining > 0,
text=text,
tts_callback=_tts_callback,
use_tools=_iterations_remaining > 0,
)
)

@@ -342,6 +363,7 @@ def _clear_tasks() -> None:
client=client,
post_callback=post_callback,
scheduler=scheduler,
text=text,
training_callback=training_callback,
_iterations_remaining=_iterations_remaining - 1,
)
@@ -352,6 +374,7 @@
client=client,
post_callback=post_callback,
scheduler=scheduler,
text=text,
training_callback=training_callback,
_iterations_remaining=_iterations_remaining - 1,
) # Recursive chat (e.g., for retry or tools)
@@ -368,6 +391,7 @@ async def _execute_llm_chat( # noqa: PLR0911, PLR0912, PLR0915
call: CallStateModel,
client: CallAutomationClient,
post_callback: Callable[[CallStateModel], Awaitable[None]],
text: str,
tts_callback: Callable[[str, MessageStyleEnum], Awaitable[None]],
use_tools: bool,
) -> tuple[bool, bool, CallStateModel]:
@@ -437,14 +461,23 @@ async def _content_callback(
tools = await plugins.to_openai()
logger.debug("Tools: %s", tools)

# Add user message in a temporary current context
call_copy = call.model_copy()
call_copy.messages.append(
MessageModel(
content=text,
persona=MessagePersonaEnum.HUMAN,
)
)

# Execute LLM inference
maximum_tokens_reached = False
content_buffer_pointer = 0
tool_calls_buffer: dict[int, MessageToolModel] = {}
try:
async for delta in completion_stream(
max_tokens=160, # Enough for ~90% of cases; if insufficient, a retry is triggered (100 tokens ~= 75 words, 20 words ~= 1 sentence, 6 sentences ~= 160 tokens)
messages=call.messages,
messages=call_copy.messages,
system=system,
tools=tools,
):
@@ -554,28 +587,21 @@ async def _in_audio( # noqa: PLR0913
clear_tts_task: asyncio.Task | None = None
flush_task: asyncio.Task | None = None

# Init VAD parameters
rms_threshold = await vad_threshold()
sample_width = bits_per_sample // 8
silence_duration_ms = await vad_silence_timeout_ms()

async def _flush_callback() -> None:
"""
Flush the audio buffer if no audio is detected for a while.
"""
nonlocal clear_tts_task

# Wait for the timeout
await asyncio.sleep(silence_duration_ms / 1000)
await asyncio.sleep(await vad_silence_timeout_ms() / 1000)

# Cancel the TTS clear task if any
if clear_tts_task:
clear_tts_task.cancel()
clear_tts_task = None

logger.debug(
"Timeout triggered after %ims, flushing audio buffer", silence_duration_ms
)
logger.debug("Timeout triggered, flushing audio buffer")

# Commit the buffer
await response_callback()
@@ -586,8 +612,8 @@ async def _clear_tts_callback() -> None:
Start is the index of the buffer where the TTS was triggered.
"""
# Wait 200ms before clearing the TTS queue
await asyncio.sleep(0.2)
# Wait before clearing the TTS queue
await asyncio.sleep(await vad_cutoff_timeout_ms() / 1000)

logger.debug("Voice detected, cancelling TTS")

@@ -604,19 +630,26 @@
channels=channels,
data=in_chunck,
frame_rate=sample_rate,
sample_width=sample_width,
sample_width=bits_per_sample // 8,
)

# Confirm ASAP that the event is processed
in_stream.task_done()

# Apply high-pass and low-pass filters in a simple attempt to reduce noise
in_audio = high_pass_filter(in_audio, 200)
in_audio = low_pass_filter(in_audio, 3000)

# Always add the audio to the buffer
assert isinstance(in_audio.raw_data, bytes)
out_stream.write(in_audio.raw_data)

# Get the relative dB; silence should sit at 1 to 5% of the max, so 0.1 to 0.5 of the threshold
in_empty = False
if min(in_audio.rms / in_audio.max_possible_amplitude * 10, 1) < rms_threshold:
if (
min(in_audio.rms / in_audio.max_possible_amplitude * 10, 1)
< await vad_threshold()
):
in_empty = True
# Start timeout if not already started and VAD already triggered
if not flush_task:
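
The noise filtering and threshold check above can be exercised on their own. Below is a minimal, self-contained sketch of the same logic, assuming only `pydub` is installed; the constant mirrors the `vad_threshold` default, and `is_silence` is an illustrative name, not a function from the repository:

```python
from pydub import AudioSegment
from pydub.effects import high_pass_filter, low_pass_filter
from pydub.generators import Sine

VAD_THRESHOLD = 0.5  # mirrors the `vad_threshold` default

def is_silence(chunk: AudioSegment) -> bool:
    # Band-pass roughly to the voice range, a simple attempt to reduce noise
    chunk = high_pass_filter(chunk, 200)
    chunk = low_pass_filter(chunk, 3000)
    # Relative RMS scaled by 10: silence sits at 1-5% of the max amplitude,
    # which lands at 0.1-0.5 against the threshold
    return min(chunk.rms / chunk.max_possible_amplitude * 10, 1) < VAD_THRESHOLD

# A full-scale 440 Hz tone reads as voice; attenuated by 60 dB it reads as silence.
tone = Sine(440).to_audio_segment(duration=100)  # 100 ms
assert not is_silence(tone)
assert is_silence(tone - 60)
```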
6 changes: 5 additions & 1 deletion app/helpers/features.py
@@ -29,7 +29,11 @@ async def callback_timeout_hour() -> int:


async def vad_silence_timeout_ms() -> int:
return await _get(key="vad_silence_timeout_ms", type_res=int) or 500
return await _get(key="vad_silence_timeout_ms", type_res=int) or 400


async def vad_cutoff_timeout_ms() -> int:
return await _get(key="vad_cutoff_timeout_ms", type_res=int) or 400


async def vad_threshold() -> float:
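
Every flag in `app/helpers/features.py` follows the same getter pattern: read the key from App Configuration, coerce it to the expected type, and fall back to a hard-coded default. As a sketch, a hypothetical new flag would look like this (the name and default are illustrative, not part of this commit):

```python
# Hypothetical flag following the existing pattern. `_get` is the module's
# helper, assumed to return None when the key is unset or fails to parse.
async def recognition_language() -> str:
    return await _get(key="recognition_language", type_res=str) or "en-US"
```

Note the `or` fallback: a configured value that parses to something falsy (such as `0` for an `int` flag) silently falls back to the default, which is worth keeping in mind for the timeout flags above.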
43 changes: 22 additions & 21 deletions cicd/bicep/app.bicep
@@ -565,72 +565,72 @@ resource contentfilter 'Microsoft.CognitiveServices/accounts/raiPolicies@2024-06
}
// Prompt
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'hate'
severityThreshold: 'High'
source: 'Prompt'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'sexual'
severityThreshold: 'High'
source: 'Prompt'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'selfharm'
severityThreshold: 'High'
source: 'Prompt'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'violence'
severityThreshold: 'High'
source: 'Prompt'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'profanity'
severityThreshold: 'High'
source: 'Prompt'
}
// Completion
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'hate'
severityThreshold: 'High'
source: 'Completion'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'sexual'
severityThreshold: 'High'
source: 'Completion'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'selfharm'
severityThreshold: 'High'
source: 'Completion'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'violence'
severityThreshold: 'High'
source: 'Completion'
}
{
blocking: !promptContentFilter
enabled: !promptContentFilter
blocking: promptContentFilter
enabled: promptContentFilter
name: 'profanity'
severityThreshold: 'High'
source: 'Completion'
@@ -905,7 +905,8 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@
recognition_retry_max: 2
recording_enabled: false
slow_llm_for_chat: false
vad_silence_timeout_ms: 500
vad_cutoff_timeout_ms: 400
vad_silence_timeout_ms: 400
vad_threshold: '0.5'
}): {
parent: configStore