whisper.cpp from 73e33a182c3e608ac5947375b4ca3f8ee0ad4253 and patched to support

2023-11-26 16:59:22 -07:00
commit dd62f2b9f6
158 changed files with 75910 additions and 0 deletions
--- a/examples/audio_transcription.rs
+++ b/examples/audio_transcription.rs
@@ -0,0 +1,98 @@
+// This example will not build from inside this folder.
+// Copy this code into your own project and add the `whisper-rs` and `hound` dependencies to its Cargo.toml.
+
+use hound;
+use std::fs::File;
+use std::io::Write;
+use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
+
+/// Loads a context and model, transcribes `audio.wav`, and writes the resulting transcript to
+/// stdout and to `transcript.txt`.
+///
+/// The input must be 16-bit integer PCM sampled at 16KHz, either mono or stereo; any other
+/// format aborts with a panic before any samples are decoded.
+///
+/// # Errors
+///
+/// Propagates the error returned by `whisper_rs::convert_stereo_to_mono_audio`.
+///
+/// # Panics
+///
+/// Panics if the model or the audio file cannot be loaded, the audio format is unsupported, the
+/// model fails to run, or the transcript file cannot be created or written.
+fn main() -> Result<(), &'static str> {
+    // Load a context and model.
+    let ctx = WhisperContext::new("example/path/to/model/whisper.cpp/models/ggml-base.en.bin")
+        .expect("failed to load model");
+    // Create a state to run the model with.
+    let mut state = ctx.create_state().expect("failed to create state");
+
+    // Create a params object for running the model.
+    // The number of past samples to consider defaults to 0.
+    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 });
+
+    // Edit params as needed.
+    // Set the number of threads to use to 1.
+    params.set_n_threads(1);
+    // Enable translation.
+    params.set_translate(true);
+    // Set the language to translate to to English.
+    params.set_language(Some("en"));
+    // Disable anything that prints to stdout.
+    params.set_print_special(false);
+    params.set_print_progress(false);
+    params.set_print_realtime(false);
+    params.set_print_timestamps(false);
+
+    // Open the audio file and read its format header.
+    let mut reader = hound::WavReader::open("audio.wav").expect("failed to open file");
+    let hound::WavSpec {
+        channels,
+        sample_rate,
+        bits_per_sample,
+        sample_format,
+        ..
+    } = reader.spec();
+
+    // Validate the format up front: the samples are decoded as `i16` below and the model needs
+    // 16KHz input, so reject anything else with a clear message before decoding any samples.
+    if sample_format != hound::SampleFormat::Int {
+        panic!("expected integer sample format");
+    }
+    if bits_per_sample != 16 {
+        panic!("expected 16 bits per sample");
+    }
+    if sample_rate != 16000 {
+        panic!("sample rate must be 16KHz");
+    }
+    if channels != 1 && channels != 2 {
+        panic!(">2 channels unsupported");
+    }
+
+    // Decode the integer samples and convert them to floating point.
+    let samples = reader
+        .samples::<i16>()
+        .map(|s| s.expect("invalid sample"))
+        .collect::<Vec<_>>();
+    let mut audio = whisper_rs::convert_integer_to_float_audio(&samples);
+
+    // Convert audio to 16KHz mono f32 samples, as required by the model.
+    // These utilities are provided for convenience, but can be replaced with custom conversion logic.
+    // SIMD variants of these functions are also available on nightly Rust (see the docs).
+    if channels == 2 {
+        audio = whisper_rs::convert_stereo_to_mono_audio(&audio)?;
+    }
+
+    // Run the model.
+    state.full(params, &audio[..]).expect("failed to run model");
+
+    // Create a file to write the transcript to.
+    let mut file = File::create("transcript.txt").expect("failed to create file");
+
+    // Iterate through the segments of the transcript.
+    let num_segments = state
+        .full_n_segments()
+        .expect("failed to get number of segments");
+    for i in 0..num_segments {
+        // Get the transcribed text and timestamps for the current segment.
+        let segment = state
+            .full_get_segment_text(i)
+            .expect("failed to get segment");
+        let start_timestamp = state
+            .full_get_segment_t0(i)
+            .expect("failed to get start timestamp");
+        let end_timestamp = state
+            .full_get_segment_t1(i)
+            .expect("failed to get end timestamp");
+
+        // Format the segment once, then emit the same line to stdout and to the file.
+        let line = format!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
+        println!("{}", line);
+        writeln!(file, "{}", line).expect("failed to write to file");
+    }
+    Ok(())
+}
--- a/examples/basic_use.rs
+++ b/examples/basic_use.rs
@@ -0,0 +1,73 @@
+#![allow(clippy::uninlined_format_args)]
+
+use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
+
+/// Walks through the typical whisper-rs workflow: load a model, configure the parameters, run
+/// the model over a buffer of audio, and print the resulting segments.
+///
+/// Note that running this example will not do anything: `main` never calls this function. It
+/// is just a demonstration of how to use the library, and actual usage requires more
+/// dependencies than the base library.
+pub fn usage() -> Result<(), &'static str> {
+    // load a context and model, then make a state from it
+    let context = WhisperContext::new("path/to/model").expect("failed to load model");
+    let mut whisper_state = context.create_state().expect("failed to create state");
+
+    // create a params object (n_past defaults to 0)
+    // note that currently the only implemented strategy is Greedy, BeamSearch is a WIP
+    let mut run_params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
+
+    // edit things as needed:
+    // use a single thread, enable translation, and translate to english
+    run_params.set_n_threads(1);
+    run_params.set_translate(true);
+    run_params.set_language(Some("en"));
+
+    // explicitly disable anything that prints to stdout
+    run_params.set_print_special(false);
+    run_params.set_print_progress(false);
+    run_params.set_print_realtime(false);
+    run_params.set_print_timestamps(false);
+
+    // assume we have a buffer of audio data
+    // here we'll make a fake one: integer samples, 16 bit, 16KHz, stereo
+    let raw_samples = vec![0_i16; 16000 * 2];
+
+    // the model needs 16KHz mono f32 samples, so convert in two steps
+    // these helpers are just provided for convenience: you can do the conversion yourself
+    // SIMD variants of these functions are also available, but only on nightly Rust: see the docs
+    let float_samples = whisper_rs::convert_integer_to_float_audio(&raw_samples);
+    let mono_samples = whisper_rs::convert_stereo_to_mono_audio(&float_samples)?;
+
+    // now we can run the model, using the state we created above
+    whisper_state
+        .full(run_params, &mono_samples[..])
+        .expect("failed to run model");
+
+    // fetch the results, one segment at a time
+    let segment_count = whisper_state
+        .full_n_segments()
+        .expect("failed to get number of segments");
+    for idx in 0..segment_count {
+        let text = whisper_state
+            .full_get_segment_text(idx)
+            .expect("failed to get segment");
+        let t0 = whisper_state
+            .full_get_segment_t0(idx)
+            .expect("failed to get segment start timestamp");
+        let t1 = whisper_state
+            .full_get_segment_t1(idx)
+            .expect("failed to get segment end timestamp");
+        println!("[{} - {}]: {}", t0, t1, text);
+    }
+
+    Ok(())
+}
+
+/// Prints a notice pointing the reader at the source; `usage` is never invoked from here.
+fn main() {
+    let notice = "running this example does nothing! see the source code for usage";
+    println!("{}", notice);
+}
--- a/examples/full_usage/2830-3980-0043.wav
+++ b/examples/full_usage/2830-3980-0043.wav
--- a/examples/full_usage/Cargo.toml
+++ b/examples/full_usage/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "full_usage"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+hound = "3"
+whisper-rs = { path = "../.." }
--- a/examples/full_usage/src/main.rs
+++ b/examples/full_usage/src/main.rs
@@ -0,0 +1,75 @@
+#![allow(clippy::uninlined_format_args)]
+
+use hound::{SampleFormat, WavReader};
+use std::path::Path;
+use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
+
+/// Reads the WAV file at `path` and returns its samples as `i16` values.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened, if it is not mono, integer-format, 16KHz, 16-bit audio,
+/// or if any sample fails to decode.
+fn parse_wav_file(path: &Path) -> Vec<i16> {
+    let wav = WavReader::open(path).expect("failed to read file");
+
+    // Check the header once up front; each requirement has its own message.
+    let spec = wav.spec();
+    assert!(spec.channels == 1, "expected mono audio file");
+    assert!(
+        spec.sample_format == SampleFormat::Int,
+        "expected integer sample format"
+    );
+    assert!(spec.sample_rate == 16000, "expected 16KHz sample rate");
+    assert!(spec.bits_per_sample == 16, "expected 16 bits per sample");
+
+    wav.into_samples::<i16>()
+        .map(|sample| sample.expect("sample"))
+        .collect()
+}
+
+/// Transcribes a WAV file with a Whisper model and prints each segment with its timestamps,
+/// followed by how long the model run took.
+///
+/// Usage: `full_usage <path to WAV file> <path to Whisper model>`
+///
+/// # Panics
+///
+/// Panics if either argument is missing or names a path that doesn't exist, if the WAV file is
+/// rejected by `parse_wav_file`, or if any whisper-rs call fails.
+fn main() {
+    // Skip the program name and walk the remaining arguments once.
+    let mut args = std::env::args().skip(1);
+
+    let arg1 = args
+        .next()
+        .expect("first argument should be path to WAV file");
+    let audio_path = Path::new(&arg1);
+    if !audio_path.exists() {
+        panic!("audio file doesn't exist");
+    }
+    let arg2 = args
+        .next()
+        .expect("second argument should be path to Whisper model");
+    let whisper_path = Path::new(&arg2);
+    if !whisper_path.exists() {
+        panic!("whisper file doesn't exist")
+    }
+
+    // Decode the audio and convert it to floating point samples.
+    let original_samples = parse_wav_file(audio_path);
+    let samples = whisper_rs::convert_integer_to_float_audio(&original_samples);
+
+    let ctx = WhisperContext::new(&whisper_path.to_string_lossy()).expect("failed to open model");
+    let mut state = ctx.create_state().expect("failed to create state");
+    let mut params = FullParams::new(SamplingStrategy::default());
+    params.set_progress_callback_safe(|progress| println!("Progress callback: {}%", progress));
+
+    // Time only the model run itself, not model loading or audio decoding.
+    let started = std::time::Instant::now();
+    state.full(params, &samples).expect("failed to run model");
+    let elapsed = started.elapsed();
+
+    let num_segments = state
+        .full_n_segments()
+        .expect("failed to get number of segments");
+    for i in 0..num_segments {
+        let segment = state
+            .full_get_segment_text(i)
+            .expect("failed to get segment");
+        let start_timestamp = state
+            .full_get_segment_t0(i)
+            .expect("failed to get start timestamp");
+        let end_timestamp = state
+            .full_get_segment_t1(i)
+            .expect("failed to get end timestamp");
+        println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
+    }
+    println!("took {}ms", elapsed.as_millis());
+}