Skip to content

Commit

Permalink
Fix:speech recognition
Browse files Browse the repository at this point in the history
  • Loading branch information
jianchang512 committed Jun 10, 2024
1 parent 4cd2064 commit 13c782d
Show file tree
Hide file tree
Showing 11 changed files with 115 additions and 86 deletions.
4 changes: 2 additions & 2 deletions videotrans/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-

VERSION="v1.89"
VERSION_NUM=110789
VERSION="v1.90"
VERSION_NUM=110790
4 changes: 3 additions & 1 deletion videotrans/configure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def parse_init():
"separate_sec":600,
"audio_rate":1.5,
"video_rate":20,
"initial_prompt_zh":"",
"initial_prompt_zh":"Please break sentences correctly and retain punctuation",
"fontsize":16,
"fontname":"黑体",
"fontcolor":"&HFFFFFF",
Expand All @@ -82,6 +82,8 @@ def parse_init():
"backaudio_volume":0.5,
"overall_silence":2100,
"overall_maxsecs":3,
"overall_threshold":0.5,
"overall_speech_pad_ms":100,
"remove_srt_silence":False,
"remove_silence":True,
"remove_white_ms":100,
Expand Down
5 changes: 1 addition & 4 deletions videotrans/mainwin/secwin.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ def open_url(self, title):
elif title == 'issue':
webbrowser.open_new_tab("https://github.com/jianchang512/pyvideotrans/issues")
elif title == 'discord':
webbrowser.open_new_tab("https://discord.gg/4WrxqBTSn8")
webbrowser.open_new_tab("https://discord.gg/y9gUweVCCJ")
elif title == 'models':
webbrowser.open_new_tab("https://github.com/jianchang512/stt/releases/tag/0.0")
elif title == 'dll':
Expand Down Expand Up @@ -604,9 +604,6 @@ def open_url(self, title):
本软件的所有解释权均属于开发者。谨请用户在理解、同意、遵守本免责声明的前提下使用本软件。
""")

elif title == 'aihelp':
webbrowser.open_new_tab("https://www.coze.cn/store/bot/7358853334134112296?panel=1")

# 工具箱
def open_toolbox(self, index=0, is_hide=True):
try:
Expand Down
1 change: 0 additions & 1 deletion videotrans/mainwin/spwin.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,6 @@ def bind_action(self):
self.action_cuda.triggered.connect(lambda: self.util.open_url('cuda'))
self.action_online.triggered.connect(lambda: self.util.open_url('online'))
self.action_website.triggered.connect(lambda: self.util.open_url('website'))
self.action_aihelp.triggered.connect(lambda: self.util.open_url('aihelp'))
self.action_blog.triggered.connect(lambda: self.util.open_url('blog'))
self.statusLabel.clicked.connect(lambda: self.util.open_url('help'))
self.action_issue.triggered.connect(lambda: self.util.open_url('issue'))
Expand Down
7 changes: 4 additions & 3 deletions videotrans/recognition/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,13 @@ def recogn(*,
vad_filter=bool(config.settings['vad']),
vad_parameters=dict(
min_silence_duration_ms=config.settings['overall_silence'],
max_speech_duration_s=config.settings['overall_maxsecs']
max_speech_duration_s=config.settings['overall_maxsecs'],
threshold=config.settings['overall_threshold'],
speech_pad_ms=config.settings['overall_speech_pad_ms']
),
word_timestamps=True,
language=detect_language,
initial_prompt=None if detect_language != 'zh' else config.settings[
'initial_prompt_zh'])
initial_prompt=config.settings['initial_prompt_zh'])

# 保留原始语言的字幕
raw_subtitles = []
Expand Down
16 changes: 12 additions & 4 deletions videotrans/recognition/avg.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,19 @@ def recogn(*,
text = ""
try:
segments, _ = model.transcribe(chunk_filename,
beam_size=5,
best_of=5,
condition_on_previous_text=True,
beam_size=config.settings['beam_size'],
best_of=config.settings['best_of'],
condition_on_previous_text=config.settings['condition_on_previous_text'],
temperature=0 if config.settings['temperature'] == 0 else [0.0, 0.2, 0.4,0.6, 0.8, 1.0],
vad_filter=bool(config.settings['vad']),
vad_parameters=dict(
min_silence_duration_ms=config.settings['overall_silence'],
max_speech_duration_s=config.settings['overall_maxsecs'],
threshold=config.settings['overall_threshold'],
speech_pad_ms=config.settings['overall_speech_pad_ms']
),
language=detect_language,
initial_prompt=None if detect_language != 'zh' else config.settings['initial_prompt_zh'], )
initial_prompt=config.settings['initial_prompt_zh'], )
for t in segments:
text += t.text + " "
except Exception as e:
Expand Down
2 changes: 1 addition & 1 deletion videotrans/recognition/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def recogn(*,
try:
tr = model.transcribe(chunk_filename,
language=detect_language,
initial_prompt=None if detect_language != 'zh' else config.settings['initial_prompt_zh'],
initial_prompt=config.settings['initial_prompt_zh'],
condition_on_previous_text=config.settings['condition_on_previous_text']
)
for t in tr['segments']:
Expand Down
16 changes: 9 additions & 7 deletions videotrans/recognition/yuxian.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


# split audio by silence
def shorten_voice(normalized_sound, max_interval=60000):
def shorten_voice(normalized_sound, max_interval=300000):
normalized_sound = tools.match_target_amplitude(normalized_sound, -20.0)
nonsilent_data = []
audio_chunks = detect_nonsilent(normalized_sound, min_silence_len=int(config.settings['voice_silence']),
Expand All @@ -36,13 +36,13 @@ def recogn(*,
detect_language=None,
audio_file=None,
cache_folder=None,
model_name="base",
model_name="tiny",
set_p=True,
inst=None,
is_cuda=None):
if config.exit_soft or (config.current_status != 'ing' and config.box_recogn != 'ing'):
return False
if set_p:
if set_p:
tools.set_process(config.transobj['fengeyinpinshuju'], btnkey=inst.init['btnkey'] if inst else "")
noextname = os.path.basename(audio_file)
tmp_path = f'{cache_folder}/{noextname}_tmp'
Expand Down Expand Up @@ -110,19 +110,21 @@ def recogn(*,
vad_filter=bool(config.settings['vad']),
vad_parameters=dict(
min_silence_duration_ms=config.settings['overall_silence'],
max_speech_duration_s=config.settings['overall_maxsecs']
max_speech_duration_s=config.settings['overall_maxsecs'],
threshold=config.settings['overall_threshold'],
speech_pad_ms=config.settings['overall_speech_pad_ms']
),
word_timestamps=True,
language=detect_language,
initial_prompt=None if detect_language != 'zh' else config.settings['initial_prompt_zh'], )
initial_prompt=config.settings['initial_prompt_zh'])
for t in segments:
if detect_language == 'zh' and t.text == config.settings['initial_prompt_zh']:
if t.text == config.settings['initial_prompt_zh']:
continue
start_time, end_time, buffered = duration
text = t.text
text = f"{text.capitalize()}. ".replace(''', "'")
text = re.sub(r'&#\d+;', '', text).strip().strip('.')
if detect_language == 'zh' and text == config.settings['initial_prompt_zh']:
if text == config.settings['initial_prompt_zh']:
continue
if not text or re.match(r'^[,。、?‘’“”;:({}【】):;"\'\s \d`!@#$%^&*()_+=.,?/\\-]*$', text):
continue
Expand Down
33 changes: 19 additions & 14 deletions videotrans/set.ini
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,24 @@ force_edit_srt=true
; ###############语句分割相关##################################
; statement segmentation related ##################################

;用于 预先分割 和 整体识别 时,作为切割依据的最小静音片段ms,默认200ms 以及最大句子时长3s
;faster-whisper字幕整体识别模式时启用自定义静音分割片段,true=启用,显存不足时,可以设为false禁用
;Enable custom silence-based segmentation (VAD) in faster-whisper overall recognition mode; true=enable. Set to false to disable it when GPU memory is insufficient.
vad=true

;用于 faster-whisper 时 VAD选项设置作为切割依据的最小静音片段ms,默认250ms 以及最大句子时长6s
;VAD options used with faster-whisper: the minimum silence segment (ms) used as the basis for splitting, default 250ms, and the maximum sentence duration, default 6s.
overall_silence=200
overall_maxsecs=3
overall_silence=250
overall_maxsecs=6
overall_threshold=0.5
overall_speech_pad_ms=100


;用于均等分割时,作为切割依据的最小静音片段ms,默认200ms,即只有大于等于200ms的静音处才分割
;用于均等分割时和openai模式时,作为切割依据的最小静音片段ms,默认200ms,即只有大于等于200ms的静音处才分割
; Used for equal segmentation and openai mode: the minimum silence length (ms) used as the basis for splitting; only silences at least this long are split on. The value shipped here is 250ms.
voice_silence=200
;用于均等分割时的每个切片时长 秒,默认 6s,即每个字幕时长大约都是6s
voice_silence=250
;用于均等分割时的每个切片时长 秒,默认 10s,即每个字幕时长大约都是10s
;Duration in seconds of each slice for equal segmentation, default 10s, i.e. each subtitle is about 10s long.
interval_split=6
interval_split=10


;################翻译配音速度#############################
Expand Down Expand Up @@ -114,9 +121,9 @@ loop_backaudio=false
; cuda data type when recognizing subtitles from video, int8=consumes fewer resources, faster, lower precision, float32=consumes more resources, slower, higher precision, int8_float16=device of choice
cuda_com_type=float32

;中文语言的视频时,用于识别的提示词,可解决简体识别为繁体问题。但注意,有可能直接会将提示词作为识别结果返回
;The prompt words used to recognize videos in Chinese language can solve the problem of recognizing simplified Chinese as traditional Chinese. But note that there is a possibility that the prompt word will be returned directly as the result of the recognition.
initial_prompt_zh=
;发送给whisper模型的提示词
;Prompt text sent to the whisper model (passed as initial_prompt).
initial_prompt_zh=Please break sentences correctly and retain punctuation.

;字幕识别时,cpu进程
;cpu process during subtitle recognition
Expand All @@ -131,13 +138,11 @@ whisper_worker=1
beam_size=5
best_of=5

;faster-whisper字幕整体识别模式时启用自定义静音分割片段,true=启用,显存不足时,可以设为false禁用
;Enable custom mute segmentation when subtitles are in overall recognition mode, true=enable, can be set to false to disable when video memory is insufficient.
vad=true


;0=占用更少GPU资源但效果略差,1=占用更多GPU资源同时效果更好
;0 = less GPU resources but slightly worse results, 1 = more GPU resources and better results at the same time
temperature=1
temperature=0

;同 temperature, true=占用更多GPU效果更好,false=占用更少GPU效果略差
; same as temperature: true = uses more GPU resources with better results, false = uses fewer GPU resources with slightly worse results
Expand Down
Loading

0 comments on commit 13c782d

Please sign in to comment.