Transcription
Automatic speech recognition (ASR) for speech-to-text — i.e., generate text transcriptions from audio input.
Overview
Transcription uses your choice of either whisper.cpp or NVIDIA Parakeet (via ONNX Runtime) as the inference engine. Load a model using modelType: "whisper" for whisper.cpp, or modelType: "parakeet" for Parakeet. Parakeet supports multilingual transcription (TDT), English-only transcription (CTC), and speaker diarization (Sortformer).
Provide audio input as audioChunk, either as a file path (string) or an in-memory audio buffer.
transcribe() returns the full transcription as a single string. If you need partial results as they become available, use transcribeStream() to receive text chunks in real-time.
Functions
Use the following sequence of function calls: loadModel, then transcribe (or transcribeStream for partial results), then unloadModel.
For how to use each function, see SDK — API reference.
Models
whisper.cpp
You should load two models:
- a whisper.cpp-compatible model for transcription. Model file format: *.bin; and
- a VAD model (e.g., Silero) converted to GGML. Model file format: *.bin (optional, recommended).
Parakeet
Parakeet models consist of multiple ONNX files. The required files depend on the model variant:
- TDT (multilingual, ~25 languages): encoder, encoder data, decoder, vocabulary, and preprocessor files.
- CTC (English-only): model, model data, and tokenizer files.
- Sortformer (speaker diarization): a single model file.
Pass the model variant via modelConfig.modelType ("tdt", "ctc", or "sortformer") and provide the corresponding source fields in modelConfig.
For models available as constants, see SDK — Models.
Examples
whisper.cpp
The following script shows an example of whisper.cpp transcription with prompt-guided decoding, VAD, and GPU acceleration:
import { loadModel, unloadModel, transcribe, WHISPER_TINY } from "@qvac/sdk";
try {
console.log("🎤 Starting Whisper transcription with prompt example...");
// Load the Whisper model
console.log("📥 Loading Whisper model...");
const modelId = await loadModel({
modelSrc: WHISPER_TINY,
modelType: "whisper",
modelConfig: {
audio_format: "f32le",
// Sampling strategy
strategy: "greedy",
n_threads: 4,
// Transcription options
language: "en",
translate: false,
no_timestamps: false,
single_segment: false,
print_timestamps: true,
token_timestamps: true,
// Quality settings
temperature: 0.0,
suppress_blank: true,
suppress_nst: true,
// Advanced tuning
entropy_thold: 2.4,
logprob_thold: -1.0,
// VAD configuration
vad_params: {
threshold: 0.35,
min_speech_duration_ms: 200,
min_silence_duration_ms: 150,
max_speech_duration_s: 30.0,
speech_pad_ms: 600,
samples_overlap: 0.3,
},
// Context parameters for GPU
contextParams: {
use_gpu: true,
flash_attn: true,
gpu_device: 0,
},
},
onProgress: (progress) => {
console.log(progress);
},
});
console.log(`✅ Whisper model loaded with ID: ${modelId}`);
// Perform transcription
console.log("🎧 Transcribing audio...");
const text = await transcribe({
modelId,
audioChunk: "examples/audio/sample-16khz.wav",
prompt: "This is a test recording with clear speech and proper punctuation.",
});
console.log("📝 Transcription result:");
console.log(text);
// Unload the model when done
console.log("🧹 Unloading Whisper model...");
await unloadModel({ modelId });
console.log("✅ Whisper model unloaded successfully");
process.exit(0);
}
catch (error) {
console.error("❌ Error:", error);
process.exit(1);
}Parakeet TDT
The following script shows an example of multilingual transcription using the Parakeet TDT model from a WAV file:
import { loadModel, unloadModel, transcribe, PARAKEET_TDT_ENCODER_FP32, PARAKEET_TDT_ENCODER_DATA_FP32, PARAKEET_TDT_DECODER_FP32, PARAKEET_TDT_VOCAB, PARAKEET_TDT_PREPROCESSOR_FP32, } from "@qvac/sdk";
const args = process.argv.slice(2);
if (!args[0]) {
console.error("Usage: bun run examples/transcription/parakeet-tdt-filesystem.ts <wav-file-path> " +
"[encoder-onnx] [encoder-data] [decoder-onnx] [vocab-txt] [preprocessor-onnx]");
console.error("\nIf model paths are omitted, defaults to registry models.");
process.exit(1);
}
const audioFilePath = args[0];
const parakeetEncoderSrc = args[1] ?? PARAKEET_TDT_ENCODER_FP32;
const parakeetEncoderDataSrc = args[2] ?? PARAKEET_TDT_ENCODER_DATA_FP32;
const parakeetDecoderSrc = args[3] ?? PARAKEET_TDT_DECODER_FP32;
const parakeetVocabSrc = args[4] ?? PARAKEET_TDT_VOCAB;
const parakeetPreprocessorSrc = args[5] ?? PARAKEET_TDT_PREPROCESSOR_FP32;
try {
console.log("Starting Parakeet transcription example...");
console.log("Loading Parakeet model...");
const modelId = await loadModel({
modelSrc: parakeetEncoderSrc,
modelType: "parakeet",
modelConfig: {
parakeetEncoderSrc,
parakeetEncoderDataSrc,
parakeetDecoderSrc,
parakeetVocabSrc,
parakeetPreprocessorSrc,
},
onProgress: (progress) => {
console.log(`Download progress: ${progress.percentage.toFixed(1)}%`);
},
});
console.log(`Parakeet model loaded with ID: ${modelId}`);
console.log("Transcribing audio...");
const text = await transcribe({ modelId, audioChunk: audioFilePath });
console.log("Transcription result:");
console.log(text);
console.log("Unloading Parakeet model...");
await unloadModel({ modelId });
console.log("Parakeet model unloaded successfully");
}
catch (error) {
console.error("Error:", error);
process.exit(1);
}Parakeet CTC
The following script shows an example of English-only transcription using the Parakeet CTC model from a WAV file:
import { loadModel, unloadModel, transcribe, PARAKEET_CTC_FP32, PARAKEET_CTC_DATA_FP32, PARAKEET_CTC_TOKENIZER, } from "@qvac/sdk";
const args = process.argv.slice(2);
if (!args[0]) {
console.error("Usage: bun run examples/transcription/parakeet-ctc-filesystem.ts <wav-file> " +
"[model.onnx] [model.onnx_data] [tokenizer.json]");
console.error("\nIf model paths are omitted, defaults to registry models.");
process.exit(1);
}
const audioFilePath = args[0];
const parakeetCtcModelSrc = args[1] ?? PARAKEET_CTC_FP32;
const parakeetCtcModelDataSrc = args[2] ?? PARAKEET_CTC_DATA_FP32;
const parakeetTokenizerSrc = args[3] ?? PARAKEET_CTC_TOKENIZER;
try {
console.log("Loading Parakeet CTC model...");
const modelId = await loadModel({
modelSrc: parakeetCtcModelSrc,
modelType: "parakeet",
modelConfig: {
modelType: "ctc",
parakeetCtcModelSrc,
parakeetCtcModelDataSrc,
parakeetTokenizerSrc,
},
onProgress: (progress) => {
console.log(`Download progress: ${progress.percentage.toFixed(1)}%`);
},
});
console.log(`Parakeet CTC model loaded with ID: ${modelId}`);
console.log("Transcribing audio...");
const text = await transcribe({ modelId, audioChunk: audioFilePath });
console.log("Transcription result:");
console.log(text);
console.log("Unloading model...");
await unloadModel({ modelId });
console.log("Done");
}
catch (error) {
console.error("Error:", error);
process.exit(1);
}Parakeet Sortformer
The following script shows an example of speaker diarization using the Parakeet Sortformer model, followed by per-segment transcription with the TDT model:
import { loadModel, unloadModel, transcribe, PARAKEET_TDT_ENCODER_FP32, PARAKEET_TDT_ENCODER_DATA_FP32, PARAKEET_TDT_DECODER_FP32, PARAKEET_TDT_VOCAB, PARAKEET_TDT_PREPROCESSOR_FP32, PARAKEET_SORTFORMER_FP32, } from "@qvac/sdk";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { readFileSync, writeFileSync, mkdirSync } from "fs";
import { tmpdir } from "os";
const __dirname = dirname(fileURLToPath(import.meta.url));
const args = process.argv.slice(2);
const sortformerSrc = args[0] ?? PARAKEET_SORTFORMER_FP32;
const defaultAudioPath = join(__dirname, "..", "..", "..", "examples", "transcription", "audio", "diarization-sample-16k.wav");
const audioFilePath = args[1] ?? defaultAudioPath;
// ── Step 1: Diarize with Sortformer ──
const sfModelId = await loadModel({
modelSrc: sortformerSrc,
modelType: "parakeet",
modelConfig: {
modelType: "sortformer",
parakeetSortformerSrc: sortformerSrc,
},
});
const diarization = await transcribe({
modelId: sfModelId,
audioChunk: audioFilePath,
});
await unloadModel({ modelId: sfModelId });
const segments = parseDiarization(diarization);
// ── Step 2: Transcribe each segment with TDT ──
const tdtModelId = await loadModel({
modelSrc: PARAKEET_TDT_ENCODER_FP32,
modelType: "parakeet",
modelConfig: {
parakeetEncoderSrc: PARAKEET_TDT_ENCODER_FP32,
parakeetEncoderDataSrc: PARAKEET_TDT_ENCODER_DATA_FP32,
parakeetDecoderSrc: PARAKEET_TDT_DECODER_FP32,
parakeetVocabSrc: PARAKEET_TDT_VOCAB,
parakeetPreprocessorSrc: PARAKEET_TDT_PREPROCESSOR_FP32,
},
});
const pcm = readPcm(audioFilePath);
const sliceDir = join(tmpdir(), `qvac-diarize-${Date.now()}`);
mkdirSync(sliceDir, { recursive: true });
const results = [];
for (let i = 0; i < segments.length; i++) {
const seg = segments[i];
const slicePath = join(sliceDir, `seg-${i}.wav`);
if (!writeWavSlice(pcm, seg.start, seg.end, slicePath)) {
results.push({ ...seg, text: "[No speech detected]" });
continue;
}
const text = await transcribe({ modelId: tdtModelId, audioChunk: slicePath });
results.push({ ...seg, text: text.trim() || "[No speech detected]" });
}
await unloadModel({ modelId: tdtModelId });
// ── Step 3: Merge consecutive same-speaker segments and print ──
const merged = mergeSpeakers(results);
console.log("\n=== DIARIZED TRANSCRIPTION ===");
console.log("=".repeat(60));
for (const entry of merged) {
console.log(`Speaker ${entry.speaker} (${entry.start.toFixed(2)}s - ${entry.end.toFixed(2)}s):`);
console.log(` ${entry.text}\n`);
}
console.log("=".repeat(60));
console.log("\nDone!");
// ── Helpers ──
function parseDiarization(text) {
  // Turn Sortformer output lines of the form "Speaker N: Xs - Ys" into
  // { speaker, start, end } objects, sorted by start time. Lines that do
  // not match the pattern are ignored.
  const pattern = /Speaker (\d+): ([\d.]+)s - ([\d.]+)s/;
  const segments = text
    .split("\n")
    .map((line) => line.match(pattern))
    .filter((match) => match !== null)
    .map(([, speaker, start, end]) => ({
      speaker: Number(speaker),
      start: Number(start),
      end: Number(end),
    }));
  return segments.sort((a, b) => a.start - b.start);
}
function readPcm(wavPath) {
  // Extract the raw PCM payload from a WAV file on disk.
  // Chunk layout: "data" id (4 bytes) | payload size, uint32 LE (4 bytes) | payload.
  const fileBytes = readFileSync(wavPath);
  const chunkPos = fileBytes.indexOf("data");
  // Guard against malformed input: without this, indexOf() returning -1 would
  // silently produce a garbage subarray from near the start of the file.
  if (chunkPos === -1) {
    throw new Error(`No "data" chunk found in WAV file: ${wavPath}`);
  }
  const sizeOffset = chunkPos + 4;
  const payloadStart = sizeOffset + 4;
  return fileBytes.subarray(payloadStart, payloadStart + fileBytes.readUInt32LE(sizeOffset));
}
function writeWavSlice(pcm, startSec, endSec, outPath) {
  // Write the [startSec, endSec) window of a mono 16 kHz / 16-bit PCM buffer
  // to outPath as a standalone WAV file. Returns false (writing nothing)
  // when the window is empty, true otherwise.
  const SAMPLE_RATE = 16000;
  const BYTES_PER_SAMPLE = 2;
  const firstByte = Math.floor(startSec * SAMPLE_RATE) * BYTES_PER_SAMPLE;
  const lastByte = Math.min(Math.ceil(endSec * SAMPLE_RATE) * BYTES_PER_SAMPLE, pcm.length);
  if (firstByte >= lastByte) {
    return false;
  }
  const payload = pcm.subarray(firstByte, lastByte);
  // Standard 44-byte RIFF/WAVE header for uncompressed PCM.
  const header = Buffer.alloc(44);
  header.write("RIFF", 0);
  header.writeUInt32LE(36 + payload.length, 4); // RIFF chunk size
  header.write("WAVEfmt ", 8); // "WAVE" form type + "fmt " chunk id
  header.writeUInt32LE(16, 16); // fmt chunk size (PCM)
  header.writeUInt16LE(1, 20); // audio format: 1 = PCM
  header.writeUInt16LE(1, 22); // channels: mono
  header.writeUInt32LE(SAMPLE_RATE, 24); // sample rate
  header.writeUInt32LE(SAMPLE_RATE * BYTES_PER_SAMPLE, 28); // byte rate
  header.writeUInt16LE(BYTES_PER_SAMPLE, 32); // block align
  header.writeUInt16LE(16, 34); // bits per sample
  header.write("data", 36);
  header.writeUInt32LE(payload.length, 40); // data chunk size
  writeFileSync(outPath, Buffer.concat([header, payload]));
  return true;
}
function mergeSpeakers(entries) {
const out = [];
for (const e of entries) {
const last = out[out.length - 1];
if (last && last.speaker === e.speaker) {
last.text += " " + e.text;
last.end = e.end;
}
else {
out.push({ ...e });
}
}
return out;
}Tip: all examples throughout this documentation are self-contained and runnable. For instructions on how to run them, see SDK quickstart.