real real-time transcription(real-time subtitle) with English online-…

…TV on Xiaomi 14 for the first time, but bug fixes are still required
zhouwg · Mar 20, 2024 · 4cd35dd · 4cd35dd · zhouwg · Mar 20, 2024
1 parent 8b14792
commit 4cd35dd
Show file tree

Hide file tree

Showing 7 changed files with 48 additions and 30 deletions.
diff --git a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/app/IApplication.java b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/app/IApplication.java
@@ -370,6 +370,7 @@ public void initGlobal() {
             CDELibraryLoader.load("whispercpp");
             CDELog.d(TAG, "cpu core counts:" + whispercpp.get_cpu_core_counts());
             CDELog.j(TAG, "asr mode: " + mSettings.getASRMode());
+            CDELog.j(TAG, "thread counts:" + mSettings.getASRThreadCounts());
             if ((CDEUtils.ASR_MODE_NORMAL == mSettings.getASRMode()) || (CDEUtils.ASR_MODE_TRANSCRIPTION_RECORD == mSettings.getASRMode())) {
                 result = whispercpp.asr_init(modelPath, mSettings.getASRThreadCounts(), WHISPER_ASR_MODE_NORMAL);
             } else {

diff --git a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/player/ffplayer/FFPlayerView.java b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/player/ffplayer/FFPlayerView.java
@@ -1575,6 +1575,7 @@ private void onASRStart(int asrMode) {
             return;
         } else {
             CDELog.j(TAG, "ASR with GGML model file:" + file.getAbsolutePath());
+            CDELog.j(TAG, "thread counts:" + mSettings.getASRThreadCounts());
         }
 
         if (CDEUtils.getASRSubsystemInit()) {

diff --git a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/settings/ASRSettingFragment.java b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/settings/ASRSettingFragment.java
@@ -120,37 +120,18 @@ public void onPause() {
          @Override
          public void onSharedPreferenceChanged(SharedPreferences sharedPreferences, String key) {
              CDELog.j(TAG, "key : " + key);
-             if (key.contains("pref.asrmode")) {
-                 CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
-                 CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts());
-                 CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
-                 CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
-                 String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
-                 CDELog.j(TAG, "modelPath:" + modelPath);
-                 CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
-             }
-
-             if (key.contains("pref.asrthreadcounts")) {
-                 CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
-                 CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts() + 1);
-                 CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
-                 CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
-                 String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
-                 CDELog.j(TAG, "modelPath:" + modelPath);
-                 CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
-             }
-
-
-             if (key.contains("pref.ggmlmodel")) {
-                 CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
-                 CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
+             if (
+                 (key.contains("pref.asrmode"))
+                 || (key.contains("pref.asrthreadcounts"))
+                 || (key.contains("pref.ggmlmodel"))
+             ) {
                  CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
                  CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts());
                  CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
                  CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
                  String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
                  CDELog.j(TAG, "modelPath:" + modelPath);
-                 CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
+                 CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts(), mSettings.getASRMode());
              }
          }
      };

diff --git a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/utils/Settings.java b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/utils/Settings.java
@@ -78,12 +78,12 @@ public int getASRMode() {
 
     public int getASRThreadCounts() {
         String key = mAppContext.getString(R.string.pref_key_asrthreadcounts);
-        String value = mSharedPreferences.getString(key, "3"); // thread counts 4
+        String value = mSharedPreferences.getString(key, "3"); // actual thread counts is 3 + 1 = 4
         try {
-            return Integer.valueOf(value).intValue();
+            return Integer.valueOf(value).intValue() + 1;
         } catch (NumberFormatException e) {
             CDELog.j(TAG, "exception occurred");
-            return 3;
+            return 4;
         }
     }
 

diff --git a/external/.gitignore b/external/.gitignore
@@ -4,7 +4,12 @@ gstreamer/
 ncnn/
 CLBlast/
 llamacpp/
+ff-deps/
+ffdeps/
+ffmepg-deps/
 
 
+ffmpeg-6.1
+
 *.a
 *.so
diff --git a/external/whispercpp/kantv-asr.h → external/whispercpp/jni/kantv-asr.h b/external/whispercpp/kantv-asr.h → external/whispercpp/jni/kantv-asr.h
diff --git a/external/whispercpp/jni/whispercpp-jni-impl.cpp b/external/whispercpp/jni/whispercpp-jni-impl.cpp
@@ -124,6 +124,9 @@ typedef struct {
     char  sz_model_path[MAX_PATH_LEN];
     size_t n_threads;
 
+    //03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
+    size_t n_decoding_mode;         // 0:WHISPER_SAMPLING_GREEDY 1:WHISPER_SAMPLING_BEAM_SEARCH
+
     size_t n_asr_mode;                              // 0: normal transcription  1: asr pressure test 2:benchmark 3: transcription + audio record
     size_t n_benchmark_type;                        // what to benchmark: 0: asr, 1: memcpy 2: mulmat  3: whisper_encode/whisper full benchmark
     bool   b_use_gpu;
@@ -847,7 +850,9 @@ class whisper_asr {
             n_end_time = ggml_time_us();
             n_durtion = (n_end_time - n_begin_time) / 1000;
 
-            if (n_durtion > 1000) { // 1 seconds, very good on Xiaomi 14, about 500-700 ms with GGML model ggml-tiny.en-q8_0.bin
+            // 1 second, very good on Xiaomi 14, about 500-700 ms with GGML model ggml-tiny.en-q8_0.bin
+            // 0.8 second with the new method (adjusting audio_context dynamically) can crash the app suddenly or produce sketchy/incorrect/repeated tokens
+            if (n_durtion > 900) {
                 LOGGD("duration of audio data gathering is: %d milliseconds\n", n_durtion);
                 LOGGD("size of gathered audio data: %d\n", _n_whisper_in_size);
                 LOGGD("total audio sample counts %d\n", _n_total_sample_counts);
@@ -1186,6 +1191,21 @@ static const char * whisper_asr_audio_to_text(const float * pf32_audio_buffer, i
 
     begin_time = ggml_time_ms();
     whisper_reset_timings(p_asr_ctx->p_context);
+
+    //03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
+    p_asr_ctx->p_params->max_tokens        = 256;
+    p_asr_ctx->p_params->temperature_inc   = 0.0f;
+    p_asr_ctx->p_params->audio_ctx         = std::min(1500, (int)ceil((double)num_samples / (double)(320.0)) + 16);
+    if (WHISPER_SAMPLING_GREEDY == p_asr_ctx->n_decoding_mode) {
+        p_asr_ctx->p_params->strategy = WHISPER_SAMPLING_GREEDY;
+        p_asr_ctx->p_params->greedy.best_of = 1;//https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
+    } else {
+        p_asr_ctx->p_params->strategy               = WHISPER_SAMPLING_BEAM_SEARCH;
+        p_asr_ctx->p_params->beam_search.beam_size  = 5;//https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
+        p_asr_ctx->p_params->greedy.best_of         = 5;
+    }
+    //LOGGD("decoding_mode=%d, audio_ctx=%d\n", p_asr_ctx->n_decoding_mode, p_asr_ctx->p_params->audio_ctx);
+
     result = whisper_full(p_asr_ctx->p_context, *p_asr_ctx->p_params, pf32_audio_buffer, num_samples);
     if (0 != result) {
         LOGW("whisper inference failure, pls check why?\n");
@@ -1350,9 +1370,19 @@ int whisper_asr_init(const char * sz_model_path, int n_threads, int n_asrmode) {
      params.speed_up                = false;
      params.debug_mode              = false;
 
+     params.audio_ctx               = 0;
+
+     params.suppress_blank              = false;
+     //params.suppress_non_speech_tokens  = true;
+     //params.language                    = "en";
+
+     //03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
+     p_asr_ctx->n_decoding_mode         = WHISPER_SAMPLING_GREEDY;
+
+
      //params.tdrz_enable                  = false;//whisper complain failed to compute log mel spectrogram when this flag was enabled
      //params.suppress_blank               = true;
-     //params.suppress_non_speech_tokens   = true;
+     params.suppress_non_speech_tokens   = true;
 
      memcpy(p_asr_ctx->p_params, &params, sizeof(struct whisper_full_params));