wav to mkv with subtitles scripting

2024-09-21 17:34:20 -04:00
parent 5fdc60e32c
commit 2254afcbfb
3 changed files with 158 additions and 2 deletions
--- a/todo.yaml
+++ b/todo.yaml
@@ -1,4 +1,5 @@
 todo:
+- wav to subtitles
 - compound words like checkmark vs check mark should destutter
 - whisper trims outside silence so head and tail never get hit
 - split on silence-ish instead of duration
@@ -6,6 +7,6 @@ todo:
 scheduled: []
 done:
 - todo: need to overlap without ANY punctuation, which i can do by breaking into words
-  ts: Tue Jan  2 18:23:00 MST 2024
+  ts: Tue Jan  2 13:23:00 EST 2024
 - todo: overlap without stop words
-  ts: Wed Jan  3 08:22:14 MST 2024
+  ts: Wed Jan  3 03:22:14 EST 2024
--- a/wav_to_mkv.d/wav_subtitles.rs
+++ b/wav_to_mkv.d/wav_subtitles.rs
@@ -0,0 +1,100 @@
+// This example is not going to build in this folder.
+// You need to copy this code into your project and add the dependencies whisper_rs and hound to your Cargo.toml.
+
+use hound;
+use std::fs::File;
+use std::io::Write;
+use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
+
+/// Transcribes a WAV file with a whisper model and emits one line per segment.
+///
+/// Usage: `wav_subtitles <model-path> <wav-path>`.
+///
+/// The WAV must be sampled at 16 kHz with one or two channels; samples are read as
+/// `i16`. Each segment is printed to stdout and written to `transcript.txt` in the
+/// current directory as `[<t0> - <t1>]: <text>`, where `<t0>`/`<t1>` are the raw
+/// values returned by `full_get_segment_t0`/`full_get_segment_t1`.
+///
+/// # Errors
+///
+/// Returns `Err` when an argument is missing, the sample rate is not 16 kHz, the
+/// channel count is unsupported, or the stereo down-mix fails.
+///
+/// # Panics
+///
+/// Panics if the model, the WAV file or `transcript.txt` cannot be opened, or if
+/// decoding a sample, running the model, reading a segment or writing fails.
+fn main() -> Result<(), &'static str> {
+    let args: Vec<String> = std::env::args().collect();
+    // Fail with a usage message instead of an index-out-of-bounds panic.
+    let (model_path, wav_path) = match args.as_slice() {
+        [_, model, wav, ..] => (model, wav),
+        _ => return Err("usage: wav_subtitles <model-path> <wav-path>"),
+    };
+
+    // Load the model and create the state the inference runs on.
+    let ctx = WhisperContext::new(model_path).expect("failed to load model");
+    let mut state = ctx.create_state().expect("failed to create state");
+
+    // Greedy sampling; the library's own printing is disabled because the
+    // transcript is emitted by the loop at the end of this function.
+    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 });
+    params.set_n_threads(1);
+    // Enable translation and set the language to English.
+    params.set_translate(true);
+    params.set_language(Some("en"));
+    params.set_print_special(false);
+    params.set_print_progress(false);
+    params.set_print_realtime(false);
+    params.set_print_timestamps(false);
+
+    // Open the audio file and validate its format before decoding any samples.
+    let mut reader = hound::WavReader::open(wav_path).expect("failed to open file");
+    let hound::WavSpec { channels, sample_rate, .. } = reader.spec();
+    // The audio is not resampled here, so it must already be at 16 kHz.
+    if sample_rate != 16000 {
+        return Err("sample rate must be 16KHz");
+    }
+    if channels != 1 && channels != 2 {
+        return Err(">2 channels unsupported");
+    }
+
+    // Decode the integer samples and convert them to f32 for the model.
+    let samples: Vec<i16> = reader
+        .samples::<i16>()
+        .map(|s| s.expect("invalid sample"))
+        .collect();
+    let mut audio = whisper_rs::convert_integer_to_float_audio(&samples);
+    // Down-mix stereo to mono; mono input is used as is.
+    if channels == 2 {
+        audio = whisper_rs::convert_stereo_to_mono_audio(&audio)?;
+    }
+
+    // Run the model.
+    state.full(params, &audio[..]).expect("failed to run model");
+
+    // Create the file the transcript is written to.
+    let mut file = File::create("transcript.txt").expect("failed to create file");
+
+    // Emit every segment with its start and end timestamps.
+    let num_segments = state
+        .full_n_segments()
+        .expect("failed to get number of segments");
+    for i in 0..num_segments {
+        let segment = state
+            .full_get_segment_text(i)
+            .expect("failed to get segment");
+        let start_timestamp = state
+            .full_get_segment_t0(i)
+            .expect("failed to get start timestamp");
+        let end_timestamp = state
+            .full_get_segment_t1(i)
+            .expect("failed to get end timestamp");
+
+        // Format once; the same text goes to stdout and to the transcript file.
+        let line = format!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
+        println!("{}", line);
+        writeln!(file, "{}", line).expect("failed to write to file");
+    }
+    Ok(())
+}
--- a/wav_to_mkv.d/wav_to_mkv.sh
+++ b/wav_to_mkv.d/wav_to_mkv.sh
@@ -0,0 +1,55 @@
+#! /bin/bash
+
+# Usage: wav_to_mkv.sh INPUT_WAV [MODEL] [ALREADY_TRANSCRIBED]
+# Writes INPUT.srt and INPUT.mkv; ALREADY_TRANSCRIBED=true reuses ./transcript.txt.
+main() {
+   set -euo pipefail
+   local input_wav="${1:?usage: wav_to_mkv.sh INPUT_WAV [MODEL] [ALREADY_TRANSCRIBED]}"
+   local model="${2:-../models/ggml-small.en.bin}"
+   local already_transcribed="${3:-false}"
+   local stem="${input_wav%.*}"
+   # wav_subtitles rejects anything but 16 kHz audio, so resample to mono 16 kHz.
+   ffmpeg -y -i "$input_wav" -ac 1 -ar 16k "$stem.mono-16khz.wav"
+   # Compare with the literal "true" instead of executing the argument as a command.
+   if [ "$already_transcribed" != true ]; then
+      cargo run --example wav_subtitles -- "$model" "$stem.mono-16khz.wav"
+   fi
+   out_to_srt ./transcript.txt > "$stem.srt"
+   ffmpeg -y -i "$input_wav" -i "$stem.srt" "$stem.mkv"
+   ls "$stem.mkv"
+}
+
+# out_to_srt TRANSCRIPT: print TRANSCRIPT as SubRip (.srt) cues on stdout.
+# Input lines look like "[<start> - <end>]: <text>"; both timestamps are treated as
+# centiseconds. Lines that do not match are reported on stderr and skipped.
+out_to_srt() {
+   # cs_to_ts CENTISECONDS: print the value as an SRT timestamp, HH:MM:SS,mmm.
+   cs_to_ts() {
+      local cs=$((10#$1))   # force base 10 so leading zeros are not read as octal
+      # Keep the sub-second part (1 cs = 10 ms) so short cues do not collapse to
+      # zero length.
+      printf '%02d:%02d:%02d,%03d' \
+         $((cs / 360000)) $((cs / 6000 % 60)) $((cs / 100 % 60)) $((cs % 100 * 10))
+   }
+
+   # Captures: 1 = start, 2 = end, 3 = text without its leading whitespace.
+   local cue='^\[([0-9]+) - ([0-9]+)\]:[[:space:]]*(.*)$'
+   local i=0 line
+   # `|| [ -n "$line" ]` keeps a final line that lacks a trailing newline.
+   while read -r line || [ -n "$line" ]; do
+      if ! [[ $line =~ $cue ]]; then
+         echo "out_to_srt: skipping malformed line: $line" >&2
+         continue
+      fi
+      # One cue: running number, time range, text, blank separator line.
+      i=$((i + 1))
+      echo "$i"
+      echo "$(cs_to_ts "${BASH_REMATCH[1]}") --> $(cs_to_ts "${BASH_REMATCH[2]}")"
+      printf '%s\n' "${BASH_REMATCH[3]}"
+      echo
+   done < "$1"
+}
+
+if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then   # executed directly, not sourced
+   main "$@"
+fi