12 Commits

Author SHA1 Message Date
Bel LaPointe
fffea2ddf0 no render mac 2025-09-10 11:20:01 -06:00
Bel LaPointe
12dbf12299 k 2024-09-21 21:33:40 -04:00
Bel LaPointe
f04a55590f fixed 2024-09-21 21:33:40 -04:00
Bel LaPointe
2254afcbfb wav to mkv with subtitles scripting 2024-09-21 21:33:40 -04:00
bel
5fdc60e32c stem words for destuttering 2024-01-03 20:40:35 -07:00
bel
4c80247ab9 accept lower sample rates if 16k not avail 2024-01-03 17:18:07 -07:00
bel
53e675b9a0 no panic on unusable mic 2024-01-03 17:09:27 -07:00
Bel LaPointe
9780c6f2ef todo 2024-01-03 08:50:59 -07:00
Bel LaPointe
7f902af26f default update 2024-01-03 08:40:13 -07:00
Bel LaPointe
9bc009996c oop 2024-01-03 08:38:24 -07:00
Bel LaPointe
cbc8a4f9fd cargo run -- --stream-step 8 --stream-retain 4 --stream-head=2 --stream-tail=0 2> /dev/null 2024-01-03 08:37:27 -07:00
Bel LaPointe
a8c8140d18 functionize at least 2024-01-03 08:28:22 -07:00
12 changed files with 447 additions and 182 deletions

2
.gitmodules vendored
View File

@@ -1,3 +1,3 @@
[submodule "rust-whisper.d/gitea-whisper-rs"]
[submodule "gitea-whisper-rs"]
path = gitea-whisper-rs
url = https://gitea.inhome.blapointe.com/bel/whisper-rs.git

View File

@@ -13,6 +13,7 @@ if ! which rust-whisper-baked; then
fi >&2
cat <<EOF
rust-whisper-baked --stream-device pulse_monitor --stream-step 16 --stream-retain 8 --stream-{head,tail}=0.25 2> /dev/null
rust-whisper-baked --stream-device 'BlackHole 2ch' --stream-step 30 --stream-retain 1 --stream-{head,tail}=0.25 --threads 9 2> /dev/null
| tee -a "$HOME/Sync/drawful/DnD/bdoob/__log.d/$(date +%Y.%m.%d).transcript.txt"
| tee -a "$HOME/Sync/drawful/DnD/nessira.d/_log.d/$(date +%Y.%m.%d).transcript.txt"

View File

@@ -25,7 +25,11 @@ pub fn devices() -> Vec<String> {
fn _devices() -> Result<Vec<cpal::Device>, String> {
match cpal::default_host().devices() {
Ok(devices) => Ok(devices.filter(|device| {
device.supported_input_configs().unwrap().count() > 0
let input_configs = device.supported_input_configs();
if !input_configs.is_ok() {
return false;
}
input_configs.unwrap().count() > 0
}).collect()),
Err(msg) => Err(format!("failed to get devices: {}", msg)),
}
@@ -92,13 +96,22 @@ impl Listener {
filter(|device| device.name().unwrap() == self.device_name).
collect::<Vec<_>>();
let device = devices.first().unwrap();
let cfg = device.supported_input_configs()
let mut sample_rate = 15_500;
let mut cfgs: Vec<_> = device.supported_input_configs()
.unwrap()
.filter(|x| x.sample_format() == cpal::SampleFormat::F32)
.filter(|x| x.min_sample_rate() >= cpal::SampleRate(15_500))
.nth(0)
.filter(|x| x.min_sample_rate() >= cpal::SampleRate(sample_rate))
.collect();
while cfgs.len() == 0 && sample_rate > 0 {
sample_rate /= 2;
cfgs = device.supported_input_configs()
.unwrap()
.with_max_sample_rate();
.filter(|x| x.sample_format() == cpal::SampleFormat::F32)
.filter(|x| x.min_sample_rate() >= cpal::SampleRate(sample_rate))
.collect();
}
assert!(cfgs.len() > 0);
let cfg = cfgs[0].clone().with_max_sample_rate();
let downsample_ratio = cfg.channels() as f32 * (cfg.sample_rate().0 as f32 / 16_000.0);
let stream = device.build_input_stream(

View File

@@ -6,7 +6,7 @@ pub fn channel<F>(
stream: std::sync::mpsc::Receiver<Vec<f32>>,
) where F: FnMut(Result<rust_whisper_lib::Transcribed, String>) + Send + 'static {
flags.model_path = None;
flags.model_buffer = Some(include_bytes!("../../models/ggml-tiny.en.bin").to_vec());
flags.model_buffer = Some(get_fast());
rust_whisper_lib::channel(flags.clone(), handler_fn, stream);
}
@@ -15,7 +15,7 @@ pub fn wav<F>(
handler_fn: F
) where F: FnMut(Result<rust_whisper_lib::Transcribed, String>) + Send + 'static {
flags.model_path = None;
flags.model_buffer = Some(include_bytes!("../../models/ggml-distil-medium.en.bin").to_vec());
flags.model_buffer = Some(get_good());
rust_whisper_lib::wav(flags.clone(), handler_fn, flags.wav.unwrap());
}
@@ -24,10 +24,18 @@ pub fn wav_channel<F>(
handler_fn: F
) where F: FnMut(Result<rust_whisper_lib::Transcribed, String>) + Send + 'static {
flags.model_path = None;
flags.model_buffer = Some(include_bytes!("../../models/ggml-distil-medium.en.bin").to_vec());
flags.model_buffer = Some(get_good());
rust_whisper_lib::wav_channel(flags, handler_fn);
}
/// Thin wrapper that forwards `path` to `rust_whisper_lib::f32_from_wav_file`
/// and returns its result unchanged, so callers of this crate do not need to
/// depend on `rust_whisper_lib` directly for WAV loading.
pub fn f32_from_wav_file(path: &String) -> Result<Vec<f32>, String> {
rust_whisper_lib::f32_from_wav_file(path)
}
/// Returns an owned copy of the `ggml-small.en.bin` model that is embedded
/// into the binary at compile time.
fn get_fast() -> Vec<u8> {
    let embedded: &[u8] = include_bytes!("../../models/ggml-small.en.bin");
    embedded.to_vec()
}
/// Returns an owned copy of the `ggml-distil-medium.en.bin` model that is
/// embedded into the binary at compile time.
fn get_good() -> Vec<u8> {
    let embedded: &[u8] = include_bytes!("../../models/ggml-distil-medium.en.bin");
    embedded.to_vec()
}

View File

@@ -763,12 +763,23 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9b1a3d5f46d53f4a3478e2be4a5a5ce5108ea58b100dcd139830eae7f79a3a1"
[[package]]
name = "rust-stemmers"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
dependencies = [
"serde",
"serde_derive",
]
[[package]]
name = "rust-whisper-baked"
version = "0.1.0"
dependencies = [
"clap",
"listen-lib",
"rust-stemmers",
"rust-whisper-baked-lib",
"rust-whisper-lib",
"stop-words",

View File

@@ -11,3 +11,4 @@ rust-whisper-baked-lib = { path = "../rust-whisper-baked-lib" }
listen-lib = { path = "../listen-lib" }
clap = { version = "4.4.10", features = ["derive"] }
stop-words = "0.8.0"
rust-stemmers = "1.2.0"

View File

@@ -1,7 +1,7 @@
use rust_whisper_lib;
use rust_whisper_baked_lib;
use clap::Parser;
use listen_lib;
use rust_whisper_baked_lib;
use rust_whisper_lib;
use std::thread;
fn main() {
@@ -21,8 +21,10 @@ fn wav_channel(flags: rust_whisper_lib::Flags) {
Ok(transcribed) => {
let s = w.step(transcribed.to_string());
println!("{}", s);
},
Err(msg) => { eprintln!("error: {}", msg); },
}
Err(msg) => {
eprintln!("error: {}", msg);
}
};
},
);
@@ -30,14 +32,17 @@ fn wav_channel(flags: rust_whisper_lib::Flags) {
fn wav(flags: rust_whisper_lib::Flags, _path: String) {
let mut w = new_destutterer();
rust_whisper_baked_lib::wav(flags,
rust_whisper_baked_lib::wav(
flags,
move |result: Result<rust_whisper_lib::Transcribed, String>| {
match result {
Ok(transcribed) => {
let s = w.step(transcribed.to_string());
println!("{}", s);
},
Err(msg) => { eprintln!("error: {}", msg); },
}
Err(msg) => {
eprintln!("error: {}", msg);
}
};
},
);
@@ -48,12 +53,18 @@ fn channel(flags: rust_whisper_lib::Flags) {
eprintln!("rust whisper baked lib channel...");
thread::spawn(move || {
let mut w = new_destutterer();
rust_whisper_baked_lib::channel(
flags.clone(),
|result: Result<rust_whisper_lib::Transcribed, String>| {
move |result: Result<rust_whisper_lib::Transcribed, String>| {
match result {
Ok(transcribed) => { println!("{}", transcribed.to_string()); },
Err(msg) => { eprintln!("error: {}", msg); },
Ok(transcribed) => {
let s = w.step(transcribed.to_string());
println!("{}", s);
}
Err(msg) => {
eprintln!("error: {}", msg);
}
};
},
recv,
@@ -64,17 +75,25 @@ fn channel(flags: rust_whisper_lib::Flags) {
let flags = rust_whisper_lib::Flags::parse();
match flags.stream_device {
Some(device_name) => {
if device_name == "" {
eprintln!("with device ({}) '{}'", device_name.len(), &device_name);
if device_name.len() == 0 {
let mut i = 0;
for device in listen_lib::devices() {
eprintln!("{}", device);
eprintln!("[{}] {}", i, device);
i += 1;
}
eprintln!("found {} devices", i);
} else {
listen_lib::main_with(|data| {
listen_lib::main_with(
|data| {
send.send(data).unwrap();
}, device_name);
}
},
device_name,
);
}
}
None => {
eprintln!("without any device");
listen_lib::main(|data| {
send.send(data).unwrap();
});
@@ -98,7 +117,10 @@ impl Destutterer {
}
let next_words = Words::from_string(next.clone());
let mut n = self.prev.to_comparable_words().len().clamp(0, next_words.to_comparable_words().len());
let mut n = self
.prev
.comparable_len()
.clamp(0, next_words.comparable_len());
//println!("n={} prev='{:?}' next='{:?}'", n, self.prev.to_comparable_words(), next_words.to_comparable_words());
while n > 0 {
let (prev_s, _) = self.prev.last_n_comparable_to_string(n);
@@ -144,13 +166,25 @@ impl Words {
fn last_n_comparable_to_string(&self, n: usize) -> (String, usize) {
let v = self.to_comparable_words();
let v = v[(v.len() - n).clamp(0, v.len())..].to_vec();
return (v.iter().map(|x| x.s.clone().unwrap()).collect::<Vec<String>>().join(" "), v[0].idx)
return (
v.iter()
.map(|x| x.s.clone().unwrap())
.collect::<Vec<String>>()
.join(" "),
v[0].idx,
);
}
fn first_n_comparable_to_string(&self, n: usize) -> (String, usize) {
let v = self.to_comparable_words();
let v = v[0..n.clamp(0, v.len())].to_vec();
return (v.iter().map(|x| x.s.clone().unwrap()).collect::<Vec<String>>().join(" "), v[v.len()-1].idx)
return (
v.iter()
.map(|x| x.s.clone().unwrap())
.collect::<Vec<String>>()
.join(" "),
v[v.len() - 1].idx,
);
}
fn comparable_len(&self) -> usize {
@@ -158,14 +192,26 @@ impl Words {
}
fn to_comparable_words(&self) -> Vec<Word> {
self.to_words().iter().filter(|x| x.s.is_some()).map(|x| x.clone()).collect()
self.to_words()
.iter()
.filter(|x| x.s.is_some())
.map(|x| x.clone())
.collect()
}
fn to_words(&self) -> Vec<Word> {
let skips = stop_words::get("en");
let strs = self.raw.iter()
let stemmer = rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
let strs = self
.raw
.iter()
.map(|w| w.to_lowercase())
.map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::<String>())
.map(|w| {
w.chars()
.filter(|c| c.is_ascii_alphanumeric())
.collect::<String>()
})
.map(|w| stemmer.stem(&w).into_owned())
.collect::<Vec<String>>();
let mut result = vec![];
for i in 0..strs.len() {
@@ -181,7 +227,8 @@ impl Words {
}
fn to_string(&self) -> String {
self.raw.iter()
self.raw
.iter()
.map(|x| x.clone())
.collect::<Vec<String>>()
.join(" ")
@@ -201,22 +248,37 @@ mod tests {
#[test]
fn test_destutterer_stop_words() {
let mut w = new_destutterer();
assert_eq!("welcome to the internet".to_string(), w.step("welcome to the internet".to_string()));
assert_eq!("have a look around".to_string(), w.step("welcome to the a internet; have a look around".to_string()));
assert_eq!(
"welcome to the internet".to_string(),
w.step("welcome to the internet".to_string())
);
assert_eq!(
"have a look around".to_string(),
w.step("welcome to the a internet; have a look around".to_string())
);
}
#[test]
fn test_destutterer_punctuation() {
let mut w = new_destutterer();
assert_eq!("cat, dog. cow? moose!".to_string(), w.step("cat, dog. cow? moose!".to_string()));
assert_eq!("elephant! fez gator".to_string(), w.step("moose, elephant! fez gator".to_string()));
assert_eq!(
"cat, dog. cow? moose!".to_string(),
w.step("cat, dog. cow? moose!".to_string())
);
assert_eq!(
"elephant! fez gator".to_string(),
w.step("moose, elephant! fez gator".to_string())
);
assert_eq!("hij".to_string(), w.step("fez gator hij".to_string()));
}
#[test]
fn test_destutterer_basic() {
let mut w = new_destutterer();
assert_eq!("cat dog cow".to_string(), w.step(" cat dog cow ".to_string()));
assert_eq!(
"cat dog cow".to_string(),
w.step(" cat dog cow ".to_string())
);
assert_eq!("moose".to_string(), w.step(" dog cow moose ".to_string()));
}
}

View File

@@ -14,13 +14,13 @@ pub struct Flags {
#[arg(long, default_value = "8")]
pub threads: i32,
#[arg(long, default_value = "30")]
#[arg(long, default_value = "8")]
pub stream_step: u64,
#[arg(long, default_value = "28.0")]
#[arg(long, default_value = "4.0")]
pub stream_retain: f32,
#[arg(long, default_value = "0.1")]
#[arg(long, default_value = "2.0")]
pub stream_head: f32,
#[arg(long, default_value = "0.1")]
#[arg(long, default_value = "0.0")]
pub stream_tail: f32,
#[arg(long, default_value = "false")]

View File

@@ -1,9 +1,12 @@
todo:
- wav to subtitles
- compound words like checkmark vs check mark should destutter
- whisper trims outside silence so head and tail never get hit
- split on silence-ish instead of duration
- rust-whisper warn when transcription time ~ input time
scheduled: []
done:
- todo: need to overlap without ANY punctuation, which i can do by breaking into words
ts: Tue Jan 2 18:23:00 MST 2024
ts: Tue Jan 2 13:23:00 EST 2024
- todo: overlap without stop words
ts: Wed Jan 3 08:22:14 MST 2024
ts: Wed Jan 3 03:22:14 EST 2024

BIN
wav_to_mkv.d/sc.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

View File

@@ -0,0 +1,100 @@
// This example is not going to build in this folder.
// You need to copy this code into your project and add the dependencies whisper_rs and hound in your cargo.toml
use hound;
use std::fs::File;
use std::io::Write;
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
/// Loads a context and model, processes an audio file, and prints the resulting transcript to stdout.
///
/// Expects two command-line arguments: the path to a whisper model and the
/// path to a WAV file that is read as 16-bit integer samples, mono or stereo,
/// at 16KHz. Every segment is printed as `[start - end]: text` and the same
/// lines are written to `transcript.txt` in the current working directory.
///
/// # Errors
///
/// Returns a usage message when either argument is missing, and propagates
/// the error of the stereo-to-mono conversion.
///
/// # Panics
///
/// Panics when the model or the WAV file cannot be loaded, when the WAV has an
/// unsupported channel count or sample rate, when the model fails to run, or
/// when the transcript file cannot be created or written.
fn main() -> Result<(), &'static str> {
    let args: Vec<String> = std::env::args().collect();
    // Report missing arguments as a usage error instead of panicking on an
    // out-of-bounds index.
    let (model_path, wav_path) = match args.as_slice() {
        [_, model_path, wav_path, ..] => (model_path, wav_path),
        _ => return Err("usage: wav_subtitles <model.bin> <input.wav>"),
    };

    // Load a context and model.
    let ctx = WhisperContext::new(model_path).expect("failed to load model");
    // Create a state
    let mut state = ctx.create_state().expect("failed to create state");

    // Create a params object for running the model.
    // The number of past samples to consider defaults to 0.
    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 });

    // Edit params as needed.
    // The thread count is intentionally left untouched (no set_n_threads call).
    // Enable translation.
    params.set_translate(true);
    // Set the language to translate to to English.
    params.set_language(Some("en"));
    // Disable anything that prints to stdout.
    params.set_print_special(false);
    params.set_print_progress(false);
    params.set_print_realtime(false);
    params.set_print_timestamps(false);

    // Open the audio file.
    let mut reader = hound::WavReader::open(wav_path).expect("failed to open file");
    let hound::WavSpec {
        channels,
        sample_rate,
        ..
    } = reader.spec();

    // Reject unsupported input up front, before decoding the whole file.
    if channels != 1 && channels != 2 {
        panic!(">2 channels unsupported");
    }
    if sample_rate != 16000 {
        panic!("sample rate must be 16KHz");
    }

    // Convert the audio to floating point samples.
    let samples = reader
        .samples::<i16>()
        .map(|s| s.expect("invalid sample"))
        .collect::<Vec<_>>();
    let mut audio = whisper_rs::convert_integer_to_float_audio(&samples);

    // Convert audio to 16KHz mono f32 samples, as required by the model.
    // These utilities are provided for convenience, but can be replaced with custom conversion logic.
    // SIMD variants of these functions are also available on nightly Rust (see the docs).
    if channels == 2 {
        audio = whisper_rs::convert_stereo_to_mono_audio(&audio)?;
    }

    // Run the model.
    state.full(params, &audio[..]).expect("failed to run model");

    // Create a file to write the transcript to.
    let mut file = File::create("transcript.txt").expect("failed to create file");

    // Iterate through the segments of the transcript.
    let num_segments = state
        .full_n_segments()
        .expect("failed to get number of segments");
    for i in 0..num_segments {
        // Get the transcribed text and timestamps for the current segment.
        let segment = state
            .full_get_segment_text(i)
            .expect("failed to get segment");
        let start_timestamp = state
            .full_get_segment_t0(i)
            .expect("failed to get start timestamp");
        let end_timestamp = state
            .full_get_segment_t1(i)
            .expect("failed to get end timestamp");

        // Format the segment once and send the identical line to both
        // stdout and the transcript file.
        let line = format!("[{} - {}]: {}\n", start_timestamp, end_timestamp, segment);
        print!("{}", line);
        file.write_all(line.as_bytes())
            .expect("failed to write to file");
    }
    Ok(())
}

View File

@@ -0,0 +1,66 @@
#! /bin/bash
# Build an MKV from a WAV recording: a still image as the video track, the
# original audio, and subtitles produced by the wav_subtitles whisper example.
#
# Arguments:
#   $1  input WAV file (required)
#   $2  whisper model file (default: ../models/ggml-small.en.bin, resolved
#       relative to this script's directory)
#   $3  "true" to skip transcription and reuse the existing transcript.txt
#       (default: false)
#
# Writes <input>.mono-16khz.wav, <input>.srt and <input>.mkv next to the input.
main() {
    set -euo pipefail

    if [ "$#" -lt 1 ]; then
        echo "usage: ${BASH_SOURCE[0]} <input.wav> [model.bin] [already_transcribed]" >&2
        return 1
    fi

    # Anchor every repository path to this script's location so the script
    # behaves the same no matter which directory it is invoked from.
    local script_dir whisper_dir input_wav model already_transcribed output_base sanitized_wav
    script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
    whisper_dir="$script_dir/../gitea-whisper-rs"

    input_wav="$(realpath "$1")"
    model="$(realpath "${2:-$script_dir/../models/ggml-small.en.bin}")"
    already_transcribed="${3:-false}"
    output_base="${input_wav%.*}"
    sanitized_wav="$output_base.mono-16khz.wav"

    # The transcriber only accepts 16KHz audio, so resample to mono 16k first.
    ffmpeg -y -i "$input_wav" -ac 1 -ar 16k "$sanitized_wav"

    # Compare as a string rather than executing the argument as a command.
    if [ "$already_transcribed" != true ]; then
        # The example writes transcript.txt into its working directory; run it
        # in a subshell so the caller's directory is left untouched.
        (
            cd "$whisper_dir" \
                && cargo run --example wav_subtitles -- "$model" "$sanitized_wav"
        )
    fi

    out_to_srt "$whisper_dir/transcript.txt" > "$output_base.srt"

    ffmpeg -y \
        -loop 1 -i "$script_dir/sc.jpg" \
        -i "$input_wav" \
        -i "$output_base.srt" \
        -c:v libx264 \
        -tune stillimage \
        -pix_fmt yuv420p -shortest \
        "$output_base.mkv"
    ls "$output_base.mkv"
}
# Convert a transcript file into SubRip (SRT) text on stdout.
#
# $1 is a file with one segment per line in the form
#   [<start> - <end>]: <text>
# where <start> and <end> are integers treated as hundredths of a second.
# Blank lines are ignored; any other line that does not match the format is
# reported on stderr and skipped.
out_to_srt() {
    # Format a count of hundredths of a second as an SRT timestamp
    # (HH:MM:SS,mmm), keeping the sub-second part.
    cs_to_ts() {
        echo "$1" | awk '{
            secs = int($1 / 100)
            printf "%02d:%02d:%02d,%03d",
                int(secs / 3600),
                int(secs / 60) % 60,
                secs % 60,
                ($1 % 100) * 10
        }'
    }

    # Captures: 1 = start, 2 = end, 3 = segment text.
    local segment_re='^\[([0-9]+) - ([0-9]+)\]: ?(.*)$'
    local i=0 line start_cs end_cs text

    # The `|| [ -n "$line" ]` keeps a final line that lacks a trailing newline.
    while read -r line || [ -n "$line" ]; do
        if [ -z "$line" ]; then
            continue
        fi
        if ! [[ "$line" =~ $segment_re ]]; then
            echo "skipping malformed transcript line: $line" >&2
            continue
        fi
        start_cs="${BASH_REMATCH[1]}"
        end_cs="${BASH_REMATCH[2]}"
        text="${BASH_REMATCH[3]}"

        i=$((i + 1))
        echo "$i"
        echo "$(cs_to_ts "$start_cs") --> $(cs_to_ts "$end_cs")"
        printf '%s\n' "$text"
        echo
    done < "$1"
}
# Run main only when this file is executed directly, not when it is sourced.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    main "$@"
fi