whisper.cpp from 73e33a182c3e608ac5947375b4ca3f8ee0ad4253 and patched to support
This commit is contained in:
98
examples/audio_transcription.rs
Normal file
98
examples/audio_transcription.rs
Normal file
@@ -0,0 +1,98 @@
|
||||
// This example is not going to build in this folder.
|
||||
// You need to copy this code into your project and add the dependencies whisper-rs and hound to your Cargo.toml
|
||||
|
||||
use hound;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
|
||||
|
||||
/// Loads a context and model, processes an audio file, and prints the resulting transcript to stdout.
|
||||
fn main() -> Result<(), &'static str> {
|
||||
// Load a context and model.
|
||||
let ctx = WhisperContext::new("example/path/to/model/whisper.cpp/models/ggml-base.en.bin")
|
||||
.expect("failed to load model");
|
||||
// Create a state
|
||||
let mut state = ctx.create_state().expect("failed to create key");
|
||||
|
||||
// Create a params object for running the model.
|
||||
// The number of past samples to consider defaults to 0.
|
||||
let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 });
|
||||
|
||||
// Edit params as needed.
|
||||
// Set the number of threads to use to 1.
|
||||
params.set_n_threads(1);
|
||||
// Enable translation.
|
||||
params.set_translate(true);
|
||||
// Set the language to translate to to English.
|
||||
params.set_language(Some("en"));
|
||||
// Disable anything that prints to stdout.
|
||||
params.set_print_special(false);
|
||||
params.set_print_progress(false);
|
||||
params.set_print_realtime(false);
|
||||
params.set_print_timestamps(false);
|
||||
|
||||
// Open the audio file.
|
||||
let mut reader = hound::WavReader::open("audio.wav").expect("failed to open file");
|
||||
#[allow(unused_variables)]
|
||||
let hound::WavSpec {
|
||||
channels,
|
||||
sample_rate,
|
||||
bits_per_sample,
|
||||
..
|
||||
} = reader.spec();
|
||||
|
||||
// Convert the audio to floating point samples.
|
||||
let mut audio = whisper_rs::convert_integer_to_float_audio(
|
||||
&reader
|
||||
.samples::<i16>()
|
||||
.map(|s| s.expect("invalid sample"))
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
|
||||
// Convert audio to 16KHz mono f32 samples, as required by the model.
|
||||
// These utilities are provided for convenience, but can be replaced with custom conversion logic.
|
||||
// SIMD variants of these functions are also available on nightly Rust (see the docs).
|
||||
if channels == 2 {
|
||||
audio = whisper_rs::convert_stereo_to_mono_audio(&audio)?;
|
||||
} else if channels != 1 {
|
||||
panic!(">2 channels unsupported");
|
||||
}
|
||||
|
||||
if sample_rate != 16000 {
|
||||
panic!("sample rate must be 16KHz");
|
||||
}
|
||||
|
||||
// Run the model.
|
||||
state.full(params, &audio[..]).expect("failed to run model");
|
||||
|
||||
// Create a file to write the transcript to.
|
||||
let mut file = File::create("transcript.txt").expect("failed to create file");
|
||||
|
||||
// Iterate through the segments of the transcript.
|
||||
let num_segments = state
|
||||
.full_n_segments()
|
||||
.expect("failed to get number of segments");
|
||||
for i in 0..num_segments {
|
||||
// Get the transcribed text and timestamps for the current segment.
|
||||
let segment = state
|
||||
.full_get_segment_text(i)
|
||||
.expect("failed to get segment");
|
||||
let start_timestamp = state
|
||||
.full_get_segment_t0(i)
|
||||
.expect("failed to get start timestamp");
|
||||
let end_timestamp = state
|
||||
.full_get_segment_t1(i)
|
||||
.expect("failed to get end timestamp");
|
||||
|
||||
// Print the segment to stdout.
|
||||
println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
|
||||
|
||||
// Format the segment information as a string.
|
||||
let line = format!("[{} - {}]: {}\n", start_timestamp, end_timestamp, segment);
|
||||
|
||||
// Write the segment information to the file.
|
||||
file.write_all(line.as_bytes())
|
||||
.expect("failed to write to file");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
73
examples/basic_use.rs
Normal file
73
examples/basic_use.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
#![allow(clippy::uninlined_format_args)]
|
||||
|
||||
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
|
||||
|
||||
// note that running this example will not do anything, as it is just a
|
||||
// demonstration of how to use the library, and actual usage requires
|
||||
// more dependencies than the base library.
|
||||
pub fn usage() -> Result<(), &'static str> {
|
||||
// load a context and model
|
||||
let ctx = WhisperContext::new("path/to/model").expect("failed to load model");
|
||||
// make a state
|
||||
let mut state = ctx.create_state().expect("failed to create state");
|
||||
|
||||
// create a params object
|
||||
// note that currently the only implemented strategy is Greedy, BeamSearch is a WIP
|
||||
// n_past defaults to 0
|
||||
let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
|
||||
|
||||
// edit things as needed
|
||||
// here we set the number of threads to use to 1
|
||||
params.set_n_threads(1);
|
||||
// we also enable translation
|
||||
params.set_translate(true);
|
||||
// and set the language to translate to to english
|
||||
params.set_language(Some("en"));
|
||||
// we also explicitly disable anything that prints to stdout
|
||||
params.set_print_special(false);
|
||||
params.set_print_progress(false);
|
||||
params.set_print_realtime(false);
|
||||
params.set_print_timestamps(false);
|
||||
|
||||
// assume we have a buffer of audio data
|
||||
// here we'll make a fake one, integer samples, 16 bit, 16KHz, stereo
|
||||
let audio_data = vec![0_i16; 16000 * 2];
|
||||
|
||||
// we must convert to 16KHz mono f32 samples for the model
|
||||
// some utilities exist for this
|
||||
// note that you don't need to use these, you can do it yourself or any other way you want
|
||||
// these are just provided for convenience
|
||||
// SIMD variants of these functions are also available, but only on nightly Rust: see the docs
|
||||
let audio_data = whisper_rs::convert_stereo_to_mono_audio(
|
||||
&whisper_rs::convert_integer_to_float_audio(&audio_data),
|
||||
)?;
|
||||
|
||||
// now we can run the model
|
||||
// note the key we use here is the one we created above
|
||||
state
|
||||
.full(params, &audio_data[..])
|
||||
.expect("failed to run model");
|
||||
|
||||
// fetch the results
|
||||
let num_segments = state
|
||||
.full_n_segments()
|
||||
.expect("failed to get number of segments");
|
||||
for i in 0..num_segments {
|
||||
let segment = state
|
||||
.full_get_segment_text(i)
|
||||
.expect("failed to get segment");
|
||||
let start_timestamp = state
|
||||
.full_get_segment_t0(i)
|
||||
.expect("failed to get segment start timestamp");
|
||||
let end_timestamp = state
|
||||
.full_get_segment_t1(i)
|
||||
.expect("failed to get segment end timestamp");
|
||||
println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Entry point of the example binary; it only prints a notice pointing the reader at the source.
fn main() {
    const NOTICE: &str = "running this example does nothing! see the source code for usage";
    println!("{}", NOTICE);
}
|
||||
BIN
examples/full_usage/2830-3980-0043.wav
Normal file
BIN
examples/full_usage/2830-3980-0043.wav
Normal file
Binary file not shown.
10
examples/full_usage/Cargo.toml
Normal file
10
examples/full_usage/Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
||||
[package]
|
||||
name = "full_usage"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
hound = "3"
|
||||
whisper-rs = { path = "../.." }
|
||||
75
examples/full_usage/src/main.rs
Normal file
75
examples/full_usage/src/main.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
#![allow(clippy::uninlined_format_args)]
|
||||
|
||||
use hound::{SampleFormat, WavReader};
|
||||
use std::path::Path;
|
||||
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext};
|
||||
|
||||
fn parse_wav_file(path: &Path) -> Vec<i16> {
|
||||
let reader = WavReader::open(path).expect("failed to read file");
|
||||
|
||||
if reader.spec().channels != 1 {
|
||||
panic!("expected mono audio file");
|
||||
}
|
||||
if reader.spec().sample_format != SampleFormat::Int {
|
||||
panic!("expected integer sample format");
|
||||
}
|
||||
if reader.spec().sample_rate != 16000 {
|
||||
panic!("expected 16KHz sample rate");
|
||||
}
|
||||
if reader.spec().bits_per_sample != 16 {
|
||||
panic!("expected 16 bits per sample");
|
||||
}
|
||||
|
||||
reader
|
||||
.into_samples::<i16>()
|
||||
.map(|x| x.expect("sample"))
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let arg1 = std::env::args()
|
||||
.nth(1)
|
||||
.expect("first argument should be path to WAV file");
|
||||
let audio_path = Path::new(&arg1);
|
||||
if !audio_path.exists() {
|
||||
panic!("audio file doesn't exist");
|
||||
}
|
||||
let arg2 = std::env::args()
|
||||
.nth(2)
|
||||
.expect("second argument should be path to Whisper model");
|
||||
let whisper_path = Path::new(&arg2);
|
||||
if !whisper_path.exists() {
|
||||
panic!("whisper file doesn't exist")
|
||||
}
|
||||
|
||||
let original_samples = parse_wav_file(audio_path);
|
||||
let samples = whisper_rs::convert_integer_to_float_audio(&original_samples);
|
||||
|
||||
let ctx = WhisperContext::new(&whisper_path.to_string_lossy()).expect("failed to open model");
|
||||
let mut state = ctx.create_state().expect("failed to create key");
|
||||
let mut params = FullParams::new(SamplingStrategy::default());
|
||||
params.set_progress_callback_safe(|progress| println!("Progress callback: {}%", progress));
|
||||
|
||||
let st = std::time::Instant::now();
|
||||
state
|
||||
.full(params, &samples)
|
||||
.expect("failed to convert samples");
|
||||
let et = std::time::Instant::now();
|
||||
|
||||
let num_segments = state
|
||||
.full_n_segments()
|
||||
.expect("failed to get number of segments");
|
||||
for i in 0..num_segments {
|
||||
let segment = state
|
||||
.full_get_segment_text(i)
|
||||
.expect("failed to get segment");
|
||||
let start_timestamp = state
|
||||
.full_get_segment_t0(i)
|
||||
.expect("failed to get start timestamp");
|
||||
let end_timestamp = state
|
||||
.full_get_segment_t1(i)
|
||||
.expect("failed to get end timestamp");
|
||||
println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment);
|
||||
}
|
||||
println!("took {}ms", (et - st).as_millis());
|
||||
}
|
||||
Reference in New Issue
Block a user