Fix:speech recognition

millet0328 · Jun 10, 2024 · 13c782d · 13c782d
1 parent 4cd2064
commit 13c782d
Show file tree

Hide file tree

Showing 11 changed files with 115 additions and 86 deletions.
diff --git a/videotrans/__init__.py b/videotrans/__init__.py
@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-
 
-VERSION="v1.89"
-VERSION_NUM=110789
+VERSION="v1.90"
+VERSION_NUM=110790
diff --git a/videotrans/configure/config.py b/videotrans/configure/config.py
@@ -69,7 +69,7 @@ def parse_init():
         "separate_sec":600,
         "audio_rate":1.5,
         "video_rate":20,
-        "initial_prompt_zh":"",
+        "initial_prompt_zh":"Please break sentences correctly and retain punctuation",
         "fontsize":16,
         "fontname":"黑体",
         "fontcolor":"&HFFFFFF",
@@ -82,6 +82,8 @@ def parse_init():
         "backaudio_volume":0.5,
         "overall_silence":2100,
         "overall_maxsecs":3,
+        "overall_threshold":0.5,
+        "overall_speech_pad_ms":100,
         "remove_srt_silence":False,
         "remove_silence":True,
         "remove_white_ms":100,

diff --git a/videotrans/mainwin/secwin.py b/videotrans/mainwin/secwin.py
@@ -555,7 +555,7 @@ def open_url(self, title):
         elif title == 'issue':
             webbrowser.open_new_tab("https://github.com/jianchang512/pyvideotrans/issues")
         elif title == 'discord':
-            webbrowser.open_new_tab("https://discord.gg/4WrxqBTSn8")
+            webbrowser.open_new_tab("https://discord.gg/y9gUweVCCJ")
         elif title == 'models':
             webbrowser.open_new_tab("https://github.com/jianchang512/stt/releases/tag/0.0")
         elif title == 'dll':
@@ -604,9 +604,6 @@ def open_url(self, title):
 本软件的所有解释权均属于开发者。谨请用户在理解、同意、遵守本免责声明的前提下使用本软件。                
             """)
 
-        elif title == 'aihelp':
-            webbrowser.open_new_tab("https://www.coze.cn/store/bot/7358853334134112296?panel=1")
-
     # 工具箱
     def open_toolbox(self, index=0, is_hide=True):
         try:

diff --git a/videotrans/mainwin/spwin.py b/videotrans/mainwin/spwin.py
@@ -306,7 +306,6 @@ def bind_action(self):
         self.action_cuda.triggered.connect(lambda: self.util.open_url('cuda'))
         self.action_online.triggered.connect(lambda: self.util.open_url('online'))
         self.action_website.triggered.connect(lambda: self.util.open_url('website'))
-        self.action_aihelp.triggered.connect(lambda: self.util.open_url('aihelp'))
         self.action_blog.triggered.connect(lambda: self.util.open_url('blog'))
         self.statusLabel.clicked.connect(lambda: self.util.open_url('help'))
         self.action_issue.triggered.connect(lambda: self.util.open_url('issue'))

diff --git a/videotrans/recognition/all.py b/videotrans/recognition/all.py
@@ -59,12 +59,13 @@ def recogn(*,
                                           vad_filter=bool(config.settings['vad']),
                                           vad_parameters=dict(
                                               min_silence_duration_ms=config.settings['overall_silence'],
-                                              max_speech_duration_s=config.settings['overall_maxsecs']
+                                              max_speech_duration_s=config.settings['overall_maxsecs'],
+                                              threshold=config.settings['overall_threshold'],
+                                              speech_pad_ms=config.settings['overall_speech_pad_ms']
                                           ),
                                           word_timestamps=True,
                                           language=detect_language,
-                                          initial_prompt=None if detect_language != 'zh' else config.settings[
-                                              'initial_prompt_zh'])
+                                          initial_prompt=config.settings['initial_prompt_zh'])
 
         # 保留原始语言的字幕
         raw_subtitles = []

diff --git a/videotrans/recognition/avg.py b/videotrans/recognition/avg.py
@@ -106,11 +106,19 @@ def recogn(*,
         text = ""
         try:
             segments, _ = model.transcribe(chunk_filename,
-                                           beam_size=5,
-                                           best_of=5,
-                                           condition_on_previous_text=True,
+                                          beam_size=config.settings['beam_size'],
+                                          best_of=config.settings['best_of'],
+                                          condition_on_previous_text=config.settings['condition_on_previous_text'],
+                                           temperature=0 if config.settings['temperature'] == 0 else [0.0, 0.2, 0.4,0.6, 0.8, 1.0],
+                                           vad_filter=bool(config.settings['vad']),
+                                           vad_parameters=dict(
+                                               min_silence_duration_ms=config.settings['overall_silence'],
+                                               max_speech_duration_s=config.settings['overall_maxsecs'],
+                                               threshold=config.settings['overall_threshold'],
+                                               speech_pad_ms=config.settings['overall_speech_pad_ms']
+                                           ),
                                            language=detect_language,
-                                           initial_prompt=None if detect_language != 'zh' else config.settings['initial_prompt_zh'], )
+                                           initial_prompt=config.settings['initial_prompt_zh'], )
             for t in segments:
                 text += t.text + " "
         except Exception as e:

diff --git a/videotrans/recognition/openai.py b/videotrans/recognition/openai.py
@@ -91,7 +91,7 @@ def recogn(*,
         try:
             tr = model.transcribe(chunk_filename,
                                   language=detect_language,
-                                  initial_prompt=None if detect_language != 'zh' else config.settings['initial_prompt_zh'],
+                                  initial_prompt=config.settings['initial_prompt_zh'],
                                   condition_on_previous_text=config.settings['condition_on_previous_text']
             )
             for t in tr['segments']:

diff --git a/videotrans/recognition/yuxian.py b/videotrans/recognition/yuxian.py
@@ -13,7 +13,7 @@
 
 
 # split audio by silence
-def shorten_voice(normalized_sound, max_interval=60000):
+def shorten_voice(normalized_sound, max_interval=300000):
     normalized_sound = tools.match_target_amplitude(normalized_sound, -20.0)
     nonsilent_data = []
     audio_chunks = detect_nonsilent(normalized_sound, min_silence_len=int(config.settings['voice_silence']),
@@ -36,13 +36,13 @@ def recogn(*,
            detect_language=None,
            audio_file=None,
            cache_folder=None,
-           model_name="base",
+           model_name="tiny",
            set_p=True,
            inst=None,
            is_cuda=None):
     if config.exit_soft or (config.current_status != 'ing' and config.box_recogn != 'ing'):
         return False
-    if set_p:        
+    if set_p:
         tools.set_process(config.transobj['fengeyinpinshuju'], btnkey=inst.init['btnkey'] if inst else "")
     noextname = os.path.basename(audio_file)
     tmp_path = f'{cache_folder}/{noextname}_tmp'
@@ -110,19 +110,21 @@ def recogn(*,
                                            vad_filter=bool(config.settings['vad']),
                                            vad_parameters=dict(
                                                min_silence_duration_ms=config.settings['overall_silence'],
-                                               max_speech_duration_s=config.settings['overall_maxsecs']
+                                               max_speech_duration_s=config.settings['overall_maxsecs'],
+                                               threshold=config.settings['overall_threshold'],
+                                                speech_pad_ms=config.settings['overall_speech_pad_ms']
                                            ),
                                            word_timestamps=True,
                                            language=detect_language,
-                                           initial_prompt=None if detect_language != 'zh' else config.settings['initial_prompt_zh'], )
+                                           initial_prompt=config.settings['initial_prompt_zh'])
             for t in segments:
-                if detect_language == 'zh' and t.text == config.settings['initial_prompt_zh']:
+                if t.text == config.settings['initial_prompt_zh']:
                     continue
                 start_time, end_time, buffered = duration
                 text = t.text
                 text = f"{text.capitalize()}. ".replace('&#39;', "'")
                 text = re.sub(r'&#\d+;', '', text).strip().strip('.')
-                if detect_language == 'zh' and text == config.settings['initial_prompt_zh']:
+                if text == config.settings['initial_prompt_zh']:
                     continue
                 if not text or re.match(r'^[，。、？‘’“”；：（｛｝【】）:;"\'\s \d`!@#$%^&*()_+=.,?/\\-]*$', text):
                     continue

diff --git a/videotrans/set.ini b/videotrans/set.ini
@@ -56,17 +56,24 @@ force_edit_srt=true
 ; ###############语句分割相关##################################
 ; statement segmentation related ##################################
 
-;用于 预先分割 和 整体识别 时，作为切割依据的最小静音片段ms，默认200ms 以及最大句子时长3s
+;faster-whisper字幕整体识别模式时启用自定义静音分割片段，true=启用，显存不足时，可以设为false禁用
+;Enable custom silence-based segmentation (VAD) in faster-whisper overall recognition mode; true=enable, set to false to disable when video memory is insufficient.
+vad=true
+
+;用于 faster-whisper 时 VAD选项设置作为切割依据的最小静音片段ms，默认250ms 以及最大句子时长6s
 ;VAD options used with faster-whisper: overall_silence is the minimum silence duration in ms used as the split point (default 250ms), and overall_maxsecs is the maximum sentence length in seconds (default 6s).
-overall_silence=200
-overall_maxsecs=3
+overall_silence=250
+overall_maxsecs=6
+overall_threshold=0.5
+overall_speech_pad_ms=100
+
 
-;用于均等分割时，作为切割依据的最小静音片段ms，默认200ms，即只有大于等于200ms的静音处才分割
+;用于均等分割时和openai模式时，作为切割依据的最小静音片段ms，默认250ms，即只有大于等于250ms的静音处才分割
 ; Used for equal segmentation and openai mode: the minimum silence length in ms used as the split point, default 250ms, i.e. the audio is split only at silences of at least 250ms.
-voice_silence=200
-;用于均等分割时的每个切片时长 秒，默认 6s,即每个字幕时长大约都是6s
+voice_silence=250
+;用于均等分割时的每个切片时长 秒，默认 10s,即每个字幕时长大约都是10s
 ;Duration in seconds of each slice for equal segmentation, default 10s, i.e. each subtitle is about 10s long.
-interval_split=6
+interval_split=10
 
 
 ;################翻译配音速度#############################
@@ -114,9 +121,9 @@ loop_backaudio=false
 ; cuda data type when recognizing subtitles from video, int8=consumes fewer resources, faster, lower precision, float32=consumes more resources, slower, higher precision, int8_float16=device of choice
 cuda_com_type=float32
 
-;中文语言的视频时，用于识别的提示词，可解决简体识别为繁体问题。但注意，有可能直接会将提示词作为识别结果返回
-;The prompt words used to recognize videos in Chinese language can solve the problem of recognizing simplified Chinese as traditional Chinese. But note that there is a possibility that the prompt word will be returned directly as the result of the recognition.
-initial_prompt_zh=
+;发送给whisper模型的提示词
+;Prompt sent to the whisper model.
+initial_prompt_zh=Please break sentences correctly and retain punctuation.
 
 ;字幕识别时，cpu进程
 ;cpu process during subtitle recognition
@@ -131,13 +138,11 @@ whisper_worker=1
 beam_size=5
 best_of=5
 
-;faster-whisper字幕整体识别模式时启用自定义静音分割片段，true=启用，显存不足时，可以设为false禁用
-;Enable custom mute segmentation when subtitles are in overall recognition mode, true=enable, can be set to false to disable when video memory is insufficient.
-vad=true
+
 
 ;0=占用更少GPU资源但效果略差，1=占用更多GPU资源同时效果更好
 ;0 = less GPU resources but slightly worse results, 1 = more GPU resources and better results at the same time
-temperature=1
+temperature=0
 
 ;同 temperature, true=占用更多GPU效果更好，false=占用更少GPU效果略差
 ; same as temperature: true=uses more GPU resources with better results, false=uses fewer GPU resources with slightly worse results