Merge branch 'main' of https://github.com/Huanshere/VideoLingo

ishine · Sep 12, 2024 · 62afd41
2 parents b3574b7 + 58fe0cf
commit 62afd41
Show file tree

Hide file tree

Showing 11 changed files with 118 additions and 338 deletions.
diff --git a/ALL_IN_ONE.ipynb b/ALL_IN_ONE.ipynb
diff --git a/README.md b/README.md
@@ -66,9 +66,8 @@ https://github.com/user-attachments/assets/0f5d5878-bfa5-41e4-ade1-d2b81d925a7d
 
 | 限制 | 当前 | 计划 |
 |------|----------|--------------|
-| 安装步骤 | 需要一定的代码能力和计算资源 | 将Whisper部分上传到Replicate云 |
 | 音频长度 | 仅支持30分钟以内 | 将很快扩展这一限制 |
-| 多语言支持 | 英文较准确<br>其他语言（如日语）精度待提高 | 引入针对不同语言的专门模型 |
+| 多语言支持 | 英语识别效果较好<br>日语识别效果一般<br>中文识别非常不稳定且容易报错 | 引入针对不同语言的专门模型 |
 
 ## 🙏 致谢
 

diff --git a/config.example.py b/config.example.py
@@ -5,16 +5,15 @@
 ## ======================== 基本设置 ======================== ##
 ## ======================== Basic Settings ======================== ##
 
-# API 设置 建议使用唯一真神 https://api.wlai.vip/register?aff=TXMB, sonnet 价格仅 10r/1M。
-# API Settings. Recommended to use the one true god https://api.wlai.vip/register?aff=TXMB, sonnet price is only 10r/1M.
-# 申请令牌时勾选模型`claude-3-5-sonnet-20240620`，渠道建议选`默认渠道1.0`
-# When applying for a token, check the model `claude-3-5-sonnet-20240620`, recommended to choose `Default Channel 1.0`
+# API Settings
+# 为了最好的效果，请使用 claude-3.5-sonnet. 实测 deepseek-coder 也能有较好的效果且性价比高
+# For best results, please use claude-3.5-sonnet. In practice, deepseek-coder also performs well at a lower cost.
 API_KEY = 'sk-xxx'
-BASE_URL = 'https://api2.wlai.vip'
-MODEL = ['claude-3-5-sonnet-20240620']
+BASE_URL = 'https://api.deepseek.com'
+MODEL = ['deepseek-coder']
 
 # Replicate API 设置
-# Replicate API settings
+# Replicate API settings for using whisperX
 REPLICATE_API_TOKEN = "xxx"
 
 # 语言设置，用自然语言描述
@@ -43,7 +42,7 @@
 
 # Whisper 设置 [whisperx, whisperxapi, whisper_timestamped]
 # Whisper settings [whisperx, whisperxapi, whisper_timestamped]
-WHISPER_METHOD = 'whisperx'
+WHISPER_METHOD = 'whisperxapi'
 
 # 预留给 whisper_timestamped 的模型，英语场景下 medium 甚至比 large-v2 的时间轴还准
 # Reserved for whisper_timestamped model, in English scenarios, medium is even more accurate in timeline than large-v2
@@ -96,8 +95,8 @@
 # Spacy model
 # Spacy 模型
 SPACY_MODEL_MAP = {
-    "en": "en_core_web_lg",
-    "zh": "zh_core_web_lg",
+    "en": "en_core_web_sm",
+    "zh": "zh_core_web_sm",
     "es": "es_core_news_lg",
     "fr": "fr_core_news_lg",
     "de": "de_core_news_lg",
@@ -136,8 +135,7 @@ def get_joiner(language):
     elif language in LANGUAGE_SPLIT_WITHOUT_SPACE:
         return ""
     else:
-        raise ValueError(f"不支持的语言代码: {language}")
-        # raise ValueError(f"Unsupported language code: {language}")
+        raise ValueError(f"Unsupported language code: {language}")
 
 
 # 配音设置 暂时弃用

diff --git a/core/all_whisper_methods/whisperX.py b/core/all_whisper_methods/whisperX.py
@@ -71,18 +71,29 @@ def transcribe_audio(audio_file: str) -> Dict:
 
 def process_transcription(result: Dict) -> pd.DataFrame:
     all_words = []
+    # Append the raw transcription result to the debug log as JSON
+    with open('output/log/debug.json', 'a', encoding='utf-8') as f:
+        json.dump(result, f, ensure_ascii=False, indent=4)
     for segment in result['segments']:
-        for i, word in enumerate(segment['words']):
-            if 'start' not in word and i > 0:
-                all_words[-1]['text'] = f'{all_words[-1]["text"][:-1]}{word["word"]}"'
+        for word in segment['words']:
+            if 'start' not in word and 'end' not in word:
+                if all_words:
+                    # Merge into the previous word
+                    all_words[-1]['text'] = f'{all_words[-1]["text"][:-1]}{word["word"]}"'
+                else:
+                    # If this is the first word, hold it temporarily until the next word that has a timestamp
+                    temp_word = word["word"]
             else:
+                # Normal case: the word carries timing info (a missing start falls back to the previous word's end)
                 word_dict = {
-                    'text': f'{word["word"]}',
+                    'text': f'"{temp_word}{word["word"]}"' if 'temp_word' in locals() else f'"{word["word"]}"',
                     'start': word.get('start', all_words[-1]['end'] if all_words else 0),
                     'end': word['end'],
                     'score': word.get('score', 0)
                 }
                 all_words.append(word_dict)
+                if 'temp_word' in locals():
+                    del temp_word
 
     return pd.DataFrame(all_words)