wav to mkv with subtitles scripting
parent
5fdc60e32c
commit
2254afcbfb
|
|
@ -1,4 +1,5 @@
|
||||||
todo:
|
todo:
|
||||||
|
- wav to subtitles
|
||||||
- compound words like checkmark vs check mark should destutter
|
- compound words like checkmark vs check mark should destutter
|
||||||
- whisper trims outside silence so head and tail never get hit
|
- whisper trims outside silence so head and tail never get hit
|
||||||
- split on silence-ish instead of duration
|
- split on silence-ish instead of duration
|
||||||
|
|
@ -6,6 +7,6 @@ todo:
|
||||||
scheduled: []
|
scheduled: []
|
||||||
done:
|
done:
|
||||||
- todo: need to overlap without ANY punctuation, which I can do by breaking into words
|
- todo: need to overlap without ANY punctuation, which I can do by breaking into words
|
||||||
ts: Tue Jan 2 18:23:00 MST 2024
|
ts: Tue Jan 2 13:23:00 EST 2024
|
||||||
- todo: overlap without stop words
|
- todo: overlap without stop words
|
||||||
ts: Wed Jan 3 08:22:14 MST 2024
|
ts: Wed Jan 3 03:22:14 EST 2024
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,100 @@
|
||||||
|
// This example is not going to build in this folder.
|
||||||
|
// You need to copy this code into your project and add the dependencies `whisper_rs` and `hound` to your Cargo.toml.
|
||||||
|
|
||||||
|
use hound;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::Write;
|
||||||
|
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
|
||||||
|
|
||||||
|
/// Loads a context and model, processes an audio file, and prints the resulting transcript to stdout.
|
||||||
|
fn main() -> Result<(), &'static str> {
|
||||||
|
let args: Vec<String> = std::env::args().collect();
|
||||||
|
|
||||||
|
// Load a context and model.
|
||||||
|
let ctx = WhisperContext::new(&args[1])
|
||||||
|
.expect("failed to load model");
|
||||||
|
// Create a state
|
||||||
|
let mut state = ctx.create_state().expect("failed to create key");
|
||||||
|
|
||||||
|
// Create a params object for running the model.
|
||||||
|
// The number of past samples to consider defaults to 0.
|
||||||
|
let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 });
|
||||||
|
|
||||||
|
// Edit params as needed.
|
||||||
|
// Set the number of threads to use to 1.
|
||||||
|
params.set_n_threads(1);
|
||||||
|
// Enable translation.
|
||||||
|
params.set_translate(true);
|
||||||
|
// Set the language to translate to to English.
|
||||||
|
params.set_language(Some("en"));
|
||||||
|
// Disable anything that prints to stdout.
|
||||||
|
params.set_print_special(false);
|
||||||
|
params.set_print_progress(false);
|
||||||
|
params.set_print_realtime(false);
|
||||||
|
params.set_print_timestamps(false);
|
||||||
|
|
||||||
|
// Open the audio file.
|
||||||
|
let mut reader = hound::WavReader::open(&args[2]).expect("failed to open file");
|
||||||
|
#[allow(unused_variables)]
|
||||||
|
let hound::WavSpec {
|
||||||
|
channels,
|
||||||
|
sample_rate,
|
||||||
|
bits_per_sample,
|
||||||
|
..
|
||||||
|
} = reader.spec();
|
||||||
|
|
||||||
|
// Convert the audio to floating point samples.
|
||||||
|
let mut audio = whisper_rs::convert_integer_to_float_audio(
|
||||||
|
&reader
|
||||||
|
.samples::<i16>()
|
||||||
|
.map(|s| s.expect("invalid sample"))
|
||||||
|
.collect::<Vec<_>>(),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Convert audio to 16KHz mono f32 samples, as required by the model.
|
||||||
|
// These utilities are provided for convenience, but can be replaced with custom conversion logic.
|
||||||
|
// SIMD variants of these functions are also available on nightly Rust (see the docs).
|
||||||
|
if channels == 2 {
|
||||||
|
audio = whisper_rs::convert_stereo_to_mono_audio(&audio)?;
|
||||||
|
} else if channels != 1 {
|
||||||
|
panic!(">2 channels unsupported");
|
||||||
|
}
|
||||||
|
|
||||||
|
if sample_rate != 16000 {
|
||||||
|
panic!("sample rate must be 16KHz");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run the model.
|
||||||
|
state.full(params, &audio[..]).expect("failed to run model");
|
||||||
|
|
||||||
|
// Create a file to write the transcript to.
|
||||||
|
let mut file = File::create("transcript.txt").expect("failed to create file");
|
||||||
|
|
||||||
|
// Iterate through the segments of the transcript.
|
||||||
|
let num_segments = state
|
||||||
|
.full_n_segments()
|
||||||
|
.expect("failed to get number of segments");
|
||||||
|
for i in 0..num_segments {
|
||||||
|
// Get the transcribed text and timestamps for the current segment.
|
||||||
|
let segment = state
|
||||||
|
.full_get_segment_text(i)
|
||||||
|
.expect("failed to get segment");
|
||||||
|
let start_timestamp = state
|
||||||
|
.full_get_segment_t0(i)
|
||||||
|
.expect("failed to get start timestamp");
|
||||||
|
let end_timestamp = state
|
||||||
|
.full_get_segment_t1(i)
|
||||||
|
.expect("failed to get end timestamp");
|
||||||
|
|
||||||
|
// Print the segment to stdout.
|
||||||
|
println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
|
||||||
|
|
||||||
|
// Format the segment information as a string.
|
||||||
|
let line = format!("[{} - {}]: {}\n", start_timestamp, end_timestamp, segment);
|
||||||
|
|
||||||
|
// Write the segment information to the file.
|
||||||
|
file.write_all(line.as_bytes())
|
||||||
|
.expect("failed to write to file");
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
#! /bin/bash
|
||||||
|
|
||||||
|
# Transcribe a WAV file and mux it with the generated subtitles into an MKV.
#
# Arguments:
#   $1  input WAV file (required)
#   $2  model path passed to the wav_subtitles example
#       (default: ../models/ggml-small.en.bin)
#   $3  "true" to skip transcription and reuse an existing ./transcript.txt
#       (default: false)
#
# Writes <stem>.srt and <stem>.mkv next to the input, where <stem> is the input
# path without its last extension, and lists the resulting MKV on stdout.
main() {
    set -euo pipefail

    # Give a usage message rather than a bare "unbound variable" error.
    local input_wav="${1:?usage: $0 <input.wav> [model] [already_transcribed]}"
    local model="${2:-../models/ggml-small.en.bin}"
    local already_transcribed="${3:-false}"
    local stem="${input_wav%.*}"

    # Compare as a string; never execute the caller-supplied value as a command.
    if [ "$already_transcribed" != "true" ]; then
        # The resampled mono 16 kHz copy is only consumed by the transcriber,
        # so it is only produced when transcription actually runs.
        local sanitized_wav="${stem}.mono-16khz.wav"
        ffmpeg -y -i "$input_wav" -ac 1 -ar 16k "$sanitized_wav"
        cargo run --example wav_subtitles -- "$model" "$sanitized_wav"
    fi
    out_to_srt ./transcript.txt > "${stem}.srt"

    ffmpeg -y -i "$input_wav" -i "${stem}.srt" "${stem}.mkv"
    ls "${stem}.mkv"
}
|
||||||
|
|
||||||
|
# Convert a transcript file into SubRip (SRT) text on stdout.
#
# Arguments:
#   $1  transcript file whose lines look like "[<start> - <end>]: <text>",
#       with <start> and <end> given in centiseconds
#
# Every non-empty line becomes one numbered cue. Blank lines are skipped, and a
# final line without a trailing newline is still converted.
out_to_srt() {
    # Render a centisecond count as an SRT timestamp (HH:MM:SS,mmm), keeping
    # the sub-second part instead of truncating to whole seconds.
    cs_to_ts() {
        awk -v cs="$1" 'BEGIN {
            cs = int(cs)
            secs = int(cs / 100)
            printf "%02d:%02d:%02d,%03d",
                int(secs / 3600),
                int(secs / 60) % 60,
                secs % 60,
                (cs % 100) * 10
        }'
    }

    local line times start end text
    local i=0
    while IFS= read -r line || [ -n "$line" ]; do
        if [ -z "$line" ]; then
            continue
        fi
        i=$((i + 1))

        # "[<start> - <end>]: <text>" -> "<start> - <end>"
        times="${line%%]*}"
        times="${times#\[}"
        start="${times%% *}"
        end="${times##* }"
        # Everything after the first "]: " is the cue text, colons included.
        text="${line#*]: }"

        printf '%s\n' "$i"
        printf '%s --> %s\n' "$(cs_to_ts "$start")" "$(cs_to_ts "$end")"
        printf '%s\n' "$text"
        printf '\n'
    done < "$1"
}
|
||||||
|
|
||||||
|
# Call main with the script's arguments only when $0 matches this file's
# BASH_SOURCE entry, i.e. when the script is executed rather than sourced.
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
    main "$@"
fi
|
||||||
Loading…
Reference in New Issue