diff --git a/todo.yaml b/todo.yaml
index be87931..f18831b 100755
--- a/todo.yaml
+++ b/todo.yaml
@@ -1,4 +1,5 @@
 todo:
+- wav to subtitles
 - compound words like checkmark vs check mark should destutter
 - whisper trims outside silence so head and tail never get hit
 - split on silence-ish instead of duration
@@ -6,6 +7,6 @@ todo:
 scheduled: []
 done:
 - todo: need to overlap without ANY puctuation, which i can do by breaking into words
-  ts: Tue Jan 2 18:23:00 MST 2024
+  ts: Tue Jan 2 13:23:00 EST 2024
 - todo: overlap without stop words
-  ts: Wed Jan 3 08:22:14 MST 2024
+  ts: Wed Jan 3 03:22:14 EST 2024
diff --git a/wav_to_mkv.d/wav_subtitles.rs b/wav_to_mkv.d/wav_subtitles.rs
new file mode 100644
index 0000000..f4a59b2
--- /dev/null
+++ b/wav_to_mkv.d/wav_subtitles.rs
@@ -0,0 +1,107 @@
+// This example is not going to build in this folder.
+// You need to copy this code into your project and add the dependencies whisper_rs and hound in your cargo.toml
+
+use hound;
+use std::fs::File;
+use std::io::Write;
+use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
+
+/// Loads a model, transcribes a WAV file, and writes each segment to stdout and to `transcript.txt`.
+///
+/// Usage: `wav_subtitles <model.bin> <audio.wav>`. Each output line has the form
+/// `[<start> - <end>]: <text>`, which is the format `out_to_srt` in wav_to_mkv.sh parses.
+fn main() -> Result<(), &'static str> {
+    let args: Vec<String> = std::env::args().collect();
+    // Fail with a usage message instead of an index-out-of-bounds panic.
+    if args.len() < 3 {
+        return Err("usage: wav_subtitles <model.bin> <audio.wav>");
+    }
+
+    // Load a context and model.
+    let ctx = WhisperContext::new(&args[1])
+        .expect("failed to load model");
+    // Create a state
+    let mut state = ctx.create_state().expect("failed to create state");
+
+    // Create a params object for running the model.
+    // The number of past samples to consider defaults to 0.
+    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 });
+
+    // Edit params as needed.
+    // Set the number of threads to use to 1.
+    params.set_n_threads(1);
+    // Enable translation.
+    params.set_translate(true);
+    // Set the language to translate to to English.
+    params.set_language(Some("en"));
+    // Disable anything that prints to stdout.
+    params.set_print_special(false);
+    params.set_print_progress(false);
+    params.set_print_realtime(false);
+    params.set_print_timestamps(false);
+
+    // Open the audio file.
+    let mut reader = hound::WavReader::open(&args[2]).expect("failed to open file");
+    #[allow(unused_variables)]
+    let hound::WavSpec {
+        channels,
+        sample_rate,
+        bits_per_sample,
+        ..
+    } = reader.spec();
+
+    // Convert the audio to floating point samples.
+    let mut audio = whisper_rs::convert_integer_to_float_audio(
+        &reader
+            .samples::<i16>()
+            .map(|s| s.expect("invalid sample"))
+            .collect::<Vec<_>>(),
+    );
+
+    // Convert audio to 16KHz mono f32 samples, as required by the model.
+    // These utilities are provided for convenience, but can be replaced with custom conversion logic.
+    // SIMD variants of these functions are also available on nightly Rust (see the docs).
+    if channels == 2 {
+        audio = whisper_rs::convert_stereo_to_mono_audio(&audio)?;
+    } else if channels != 1 {
+        panic!(">2 channels unsupported");
+    }
+
+    if sample_rate != 16000 {
+        panic!("sample rate must be 16KHz");
+    }
+
+    // Run the model.
+    state.full(params, &audio[..]).expect("failed to run model");
+
+    // Create a file to write the transcript to.
+    let mut file = File::create("transcript.txt").expect("failed to create file");
+
+    // Iterate through the segments of the transcript.
+    let num_segments = state
+        .full_n_segments()
+        .expect("failed to get number of segments");
+    for i in 0..num_segments {
+        // Get the transcribed text and timestamps for the current segment.
+        let segment = state
+            .full_get_segment_text(i)
+            .expect("failed to get segment");
+        let start_timestamp = state
+            .full_get_segment_t0(i)
+            .expect("failed to get start timestamp");
+        let end_timestamp = state
+            .full_get_segment_t1(i)
+            .expect("failed to get end timestamp");
+
+        // Print the segment to stdout.
+        println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
+
+        // Format the segment information as a string.
+        let line = format!("[{} - {}]: {}\n", start_timestamp, end_timestamp, segment);
+
+        // Write the segment information to the file.
+        file.write_all(line.as_bytes())
+            .expect("failed to write to file");
+    }
+    Ok(())
+}
diff --git a/wav_to_mkv.d/wav_to_mkv.sh b/wav_to_mkv.d/wav_to_mkv.sh
new file mode 100644
index 0000000..b65570c
--- /dev/null
+++ b/wav_to_mkv.d/wav_to_mkv.sh
@@ -0,0 +1,57 @@
+#! /bin/bash
+
+main() {
+    set -euo pipefail
+
+    input_wav="$1"
+    model="${2:-../models/ggml-small.en.bin}"
+    already_transcribed="${3:-false}"
+
+    sanitized_wav="${input_wav%.*}.mono-16khz.wav"
+    ffmpeg -y -i "$input_wav" -ac 1 -ar 16k "$sanitized_wav"
+
+    if ! $already_transcribed; then
+        cargo run --example wav_subtitles -- "$model" "$sanitized_wav"
+    fi
+    out_to_srt ./transcript.txt > "${input_wav%.*}.srt"
+
+    ffmpeg -y -i "$input_wav" -i "${input_wav%.*}.srt" "${input_wav%.*}.mkv"
+    ls "${input_wav%.*}.mkv"
+}
+
+out_to_srt() {
+    # Convert a centisecond count into an SRT timestamp (HH:MM:SS,mmm).
+    cs_to_ts() {
+        echo "$1" | awk '{
+            printf "%02d:%02d:%02d,%03d",
+                int(($1/100.0)/60/60),
+                int(($1/100.0)/60%60),
+                int(($1/100.0)%60),
+                int($1%100)*10
+        }'
+    }
+
+    cat "$1" \
+    | (
+        i=0
+        while read -r line; do
+            ((i+=1))
+            echo "$i"
+            echo "$(cs_to_ts "$(
+                echo "${line%%]: *}" \
+                | tr -d '[' \
+                | awk '{print $1}'
+            )") --> $(cs_to_ts "$(
+                echo "${line%%]: *}" \
+                | tr -d '[' \
+                | awk '{print $3}'
+            )")"
+            echo "${line#*: }"
+            echo
+        done
+    )
+}
+
+if [ "$0" == "$BASH_SOURCE" ]; then
+    main "$@"
+fi