
Transcription

Automatic speech recognition (ASR) for speech-to-text: generate text transcriptions from audio input.

Overview

Transcription uses either whisper.cpp or NVIDIA Parakeet (via ONNX Runtime) as the inference engine. Load a model using modelType: "whisper" for whisper.cpp, or modelType: "parakeet" for Parakeet. Parakeet supports multilingual transcription (TDT), English-only transcription (CTC), and speaker diarization (Sortformer).

Provide audio input as audioChunk, either as a file path (string) or an in-memory audio buffer.
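For example, here is a minimal sketch of buffer input, assuming audioChunk accepts raw PCM samples as a Float32Array (matching the audio_format: "f32le" option used in the whisper.cpp example below; see SDK — API reference for the exact accepted types) and given a modelId from loadModel():

import { readFileSync } from "fs";
import { transcribe } from "@qvac/sdk";
// Read a 16 kHz mono 16-bit WAV and convert its samples to f32le PCM.
const wav = readFileSync("examples/audio/sample-16khz.wav");
const data = wav.subarray(44); // skip the canonical 44-byte WAV header
const samples = new Float32Array(data.length / 2);
for (let i = 0; i < samples.length; i++) {
    samples[i] = data.readInt16LE(i * 2) / 32768; // int16 → [-1, 1) float
}
const text = await transcribe({ modelId, audioChunk: samples });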

transcribe() returns the full transcription as a single string. If you need partial results as they become available, use transcribeStream() to receive text chunks in real time.
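A minimal streaming sketch, assuming transcribeStream() yields text chunks as an async iterable (check SDK — API reference for the exact return shape):

import { transcribeStream } from "@qvac/sdk";
// Print partial transcription chunks as they arrive.
for await (const chunk of transcribeStream({
    modelId,
    audioChunk: "examples/audio/sample-16khz.wav",
})) {
    process.stdout.write(chunk);
}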

Functions

Use the following sequence of function calls:

  1. loadModel()
  2. transcribe() or transcribeStream()
  3. unloadModel()

For how to use each function, see SDK — API reference.
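Condensed from the full examples below (the modelConfig tuning shown there is omitted here for brevity), the shortest end-to-end flow looks like this:

import { loadModel, transcribe, unloadModel, WHISPER_TINY } from "@qvac/sdk";
const modelId = await loadModel({ modelSrc: WHISPER_TINY, modelType: "whisper" });
const text = await transcribe({ modelId, audioChunk: "examples/audio/sample-16khz.wav" });
console.log(text);
await unloadModel({ modelId });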

Models

whisper.cpp

For whisper.cpp, provide up to two model files:

  • a whisper.cpp-compatible model for transcription (file format: *.bin); and
  • a VAD model (e.g., Silero) converted to GGML (file format: *.bin). This one is optional but recommended; a sketch of wiring it in follows this list.
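The whisper.cpp example below configures VAD behavior through vad_params but does not show how the VAD model file itself is supplied. The following sketch uses a hypothetical vadModelSrc field purely for illustration; check SDK — API reference for the actual field name:

// CAUTION: "vadModelSrc" is a hypothetical field name used for illustration
// only; see SDK — API reference for how the VAD model is actually supplied.
const modelId = await loadModel({
    modelSrc: WHISPER_TINY, // transcription model (*.bin)
    modelType: "whisper",
    modelConfig: {
        vadModelSrc: "models/silero-vad.bin", // Silero VAD converted to GGML (*.bin)
        vad_params: { threshold: 0.35 },
    },
});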

Parakeet

Parakeet models consist of multiple ONNX files. The required files depend on the model variant:

  • TDT (multilingual, ~25 languages): encoder, encoder data, decoder, vocabulary, and preprocessor files.
  • CTC (English-only): model, model data, and tokenizer files.
  • Sortformer (speaker diarization): a single model file.

Pass the model variant via modelConfig.modelType ("tdt", "ctc", or "sortformer") and provide the corresponding source fields in modelConfig.
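For reference, these are the modelConfig shapes used by the examples below (all field names and model constants are taken from those examples; note that the TDT example omits modelConfig.modelType, which suggests "tdt" is the default):

// TDT (multilingual)
const tdtConfig = {
    parakeetEncoderSrc: PARAKEET_TDT_ENCODER_FP32,
    parakeetEncoderDataSrc: PARAKEET_TDT_ENCODER_DATA_FP32,
    parakeetDecoderSrc: PARAKEET_TDT_DECODER_FP32,
    parakeetVocabSrc: PARAKEET_TDT_VOCAB,
    parakeetPreprocessorSrc: PARAKEET_TDT_PREPROCESSOR_FP32,
};
// CTC (English-only)
const ctcConfig = {
    modelType: "ctc",
    parakeetCtcModelSrc: PARAKEET_CTC_FP32,
    parakeetCtcModelDataSrc: PARAKEET_CTC_DATA_FP32,
    parakeetTokenizerSrc: PARAKEET_CTC_TOKENIZER,
};
// Sortformer (speaker diarization)
const sortformerConfig = {
    modelType: "sortformer",
    parakeetSortformerSrc: PARAKEET_SORTFORMER_FP32,
};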

For models available as constants, see SDK — Models.

Examples

whisper.cpp

The following script shows an example of whisper.cpp transcription with prompt-guided decoding, VAD, and GPU acceleration:

whispercpp-prompt.js
import { loadModel, unloadModel, transcribe, WHISPER_TINY } from "@qvac/sdk";
try {
    console.log("🎤 Starting Whisper transcription with prompt example...");
    // Load the Whisper model
    console.log("📥 Loading Whisper model...");
    const modelId = await loadModel({
        modelSrc: WHISPER_TINY,
        modelType: "whisper",
        modelConfig: {
            audio_format: "f32le",
            // Sampling strategy
            strategy: "greedy",
            n_threads: 4,
            // Transcription options
            language: "en",
            translate: false,
            no_timestamps: false,
            single_segment: false,
            print_timestamps: true,
            token_timestamps: true,
            // Quality settings
            temperature: 0.0,
            suppress_blank: true,
            suppress_nst: true,
            // Advanced tuning
            entropy_thold: 2.4,
            logprob_thold: -1.0,
            // VAD configuration
            vad_params: {
                threshold: 0.35,
                min_speech_duration_ms: 200,
                min_silence_duration_ms: 150,
                max_speech_duration_s: 30.0,
                speech_pad_ms: 600,
                samples_overlap: 0.3,
            },
            // Context parameters for GPU
            contextParams: {
                use_gpu: true,
                flash_attn: true,
                gpu_device: 0,
            },
        },
        onProgress: (progress) => {
            console.log(progress);
        },
    });
    console.log(`✅ Whisper model loaded with ID: ${modelId}`);
    // Perform transcription
    console.log("🎧 Transcribing audio...");
    const text = await transcribe({
        modelId,
        audioChunk: "examples/audio/sample-16khz.wav",
        prompt: "This is a test recording with clear speech and proper punctuation.",
    });
    console.log("📝 Transcription result:");
    console.log(text);
    // Unload the model when done
    console.log("🧹 Unloading Whisper model...");
    await unloadModel({ modelId });
    console.log("✅ Whisper model unloaded successfully");
    process.exit(0);
}
catch (error) {
    console.error("❌ Error:", error);
    process.exit(1);
}

Parakeet TDT

The following script shows an example of multilingual transcription of a WAV file using the Parakeet TDT model:

parakeet-tdt-filesystem.js
import {
    loadModel,
    unloadModel,
    transcribe,
    PARAKEET_TDT_ENCODER_FP32,
    PARAKEET_TDT_ENCODER_DATA_FP32,
    PARAKEET_TDT_DECODER_FP32,
    PARAKEET_TDT_VOCAB,
    PARAKEET_TDT_PREPROCESSOR_FP32,
} from "@qvac/sdk";
const args = process.argv.slice(2);
if (!args[0]) {
    console.error("Usage: bun run examples/transcription/parakeet-tdt-filesystem.js <wav-file-path> " +
        "[encoder-onnx] [encoder-data] [decoder-onnx] [vocab-txt] [preprocessor-onnx]");
    console.error("\nIf model paths are omitted, defaults to registry models.");
    process.exit(1);
}
const audioFilePath = args[0];
const parakeetEncoderSrc = args[1] ?? PARAKEET_TDT_ENCODER_FP32;
const parakeetEncoderDataSrc = args[2] ?? PARAKEET_TDT_ENCODER_DATA_FP32;
const parakeetDecoderSrc = args[3] ?? PARAKEET_TDT_DECODER_FP32;
const parakeetVocabSrc = args[4] ?? PARAKEET_TDT_VOCAB;
const parakeetPreprocessorSrc = args[5] ?? PARAKEET_TDT_PREPROCESSOR_FP32;
try {
    console.log("Starting Parakeet transcription example...");
    console.log("Loading Parakeet model...");
    const modelId = await loadModel({
        modelSrc: parakeetEncoderSrc,
        modelType: "parakeet",
        modelConfig: {
            parakeetEncoderSrc,
            parakeetEncoderDataSrc,
            parakeetDecoderSrc,
            parakeetVocabSrc,
            parakeetPreprocessorSrc,
        },
        onProgress: (progress) => {
            console.log(`Download progress: ${progress.percentage.toFixed(1)}%`);
        },
    });
    console.log(`Parakeet model loaded with ID: ${modelId}`);
    console.log("Transcribing audio...");
    const text = await transcribe({ modelId, audioChunk: audioFilePath });
    console.log("Transcription result:");
    console.log(text);
    console.log("Unloading Parakeet model...");
    await unloadModel({ modelId });
    console.log("Parakeet model unloaded successfully");
}
catch (error) {
    console.error("Error:", error);
    process.exit(1);
}

Parakeet CTC

The following script shows an example of English-only transcription of a WAV file using the Parakeet CTC model:

parakeet-ctc-filesystem.js
import {
    loadModel,
    unloadModel,
    transcribe,
    PARAKEET_CTC_FP32,
    PARAKEET_CTC_DATA_FP32,
    PARAKEET_CTC_TOKENIZER,
} from "@qvac/sdk";
const args = process.argv.slice(2);
if (!args[0]) {
    console.error("Usage: bun run examples/transcription/parakeet-ctc-filesystem.js <wav-file> " +
        "[model.onnx] [model.onnx_data] [tokenizer.json]");
    console.error("\nIf model paths are omitted, defaults to registry models.");
    process.exit(1);
}
const audioFilePath = args[0];
const parakeetCtcModelSrc = args[1] ?? PARAKEET_CTC_FP32;
const parakeetCtcModelDataSrc = args[2] ?? PARAKEET_CTC_DATA_FP32;
const parakeetTokenizerSrc = args[3] ?? PARAKEET_CTC_TOKENIZER;
try {
    console.log("Loading Parakeet CTC model...");
    const modelId = await loadModel({
        modelSrc: parakeetCtcModelSrc,
        modelType: "parakeet",
        modelConfig: {
            modelType: "ctc",
            parakeetCtcModelSrc,
            parakeetCtcModelDataSrc,
            parakeetTokenizerSrc,
        },
        onProgress: (progress) => {
            console.log(`Download progress: ${progress.percentage.toFixed(1)}%`);
        },
    });
    console.log(`Parakeet CTC model loaded with ID: ${modelId}`);
    console.log("Transcribing audio...");
    const text = await transcribe({ modelId, audioChunk: audioFilePath });
    console.log("Transcription result:");
    console.log(text);
    console.log("Unloading model...");
    await unloadModel({ modelId });
    console.log("Done");
}
catch (error) {
    console.error("Error:", error);
    process.exit(1);
}

Parakeet Sortformer

The following script shows an example of speaker diarization using the Parakeet Sortformer model, followed by per-segment transcription with the TDT model:

parakeet-sortformer.js
import {
    loadModel,
    unloadModel,
    transcribe,
    PARAKEET_TDT_ENCODER_FP32,
    PARAKEET_TDT_ENCODER_DATA_FP32,
    PARAKEET_TDT_DECODER_FP32,
    PARAKEET_TDT_VOCAB,
    PARAKEET_TDT_PREPROCESSOR_FP32,
    PARAKEET_SORTFORMER_FP32,
} from "@qvac/sdk";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { readFileSync, writeFileSync, mkdirSync } from "fs";
import { tmpdir } from "os";
const __dirname = dirname(fileURLToPath(import.meta.url));
const args = process.argv.slice(2);
const sortformerSrc = args[0] ?? PARAKEET_SORTFORMER_FP32;
const defaultAudioPath = join(__dirname, "..", "..", "..", "examples", "transcription", "audio", "diarization-sample-16k.wav");
const audioFilePath = args[1] ?? defaultAudioPath;
// ── Step 1: Diarize with Sortformer ──
const sfModelId = await loadModel({
    modelSrc: sortformerSrc,
    modelType: "parakeet",
    modelConfig: {
        modelType: "sortformer",
        parakeetSortformerSrc: sortformerSrc,
    },
});
const diarization = await transcribe({
    modelId: sfModelId,
    audioChunk: audioFilePath,
});
await unloadModel({ modelId: sfModelId });
const segments = parseDiarization(diarization);
// ── Step 2: Transcribe each segment with TDT ──
const tdtModelId = await loadModel({
    modelSrc: PARAKEET_TDT_ENCODER_FP32,
    modelType: "parakeet",
    modelConfig: {
        parakeetEncoderSrc: PARAKEET_TDT_ENCODER_FP32,
        parakeetEncoderDataSrc: PARAKEET_TDT_ENCODER_DATA_FP32,
        parakeetDecoderSrc: PARAKEET_TDT_DECODER_FP32,
        parakeetVocabSrc: PARAKEET_TDT_VOCAB,
        parakeetPreprocessorSrc: PARAKEET_TDT_PREPROCESSOR_FP32,
    },
});
const pcm = readPcm(audioFilePath);
const sliceDir = join(tmpdir(), `qvac-diarize-${Date.now()}`);
mkdirSync(sliceDir, { recursive: true });
const results = [];
for (let i = 0; i < segments.length; i++) {
    const seg = segments[i];
    const slicePath = join(sliceDir, `seg-${i}.wav`);
    if (!writeWavSlice(pcm, seg.start, seg.end, slicePath)) {
        results.push({ ...seg, text: "[No speech detected]" });
        continue;
    }
    const text = await transcribe({ modelId: tdtModelId, audioChunk: slicePath });
    results.push({ ...seg, text: text.trim() || "[No speech detected]" });
}
await unloadModel({ modelId: tdtModelId });
// ── Step 3: Merge consecutive same-speaker segments and print ──
const merged = mergeSpeakers(results);
console.log("\n=== DIARIZED TRANSCRIPTION ===");
console.log("=".repeat(60));
for (const entry of merged) {
    console.log(`Speaker ${entry.speaker} (${entry.start.toFixed(2)}s - ${entry.end.toFixed(2)}s):`);
    console.log(`  ${entry.text}\n`);
}
console.log("=".repeat(60));
console.log("\nDone!");
// ── Helpers ──
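// Parse Sortformer output lines like "Speaker 0: 1.23s - 4.56s" into
// { speaker, start, end } segments, sorted by start time.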
function parseDiarization(text) {
    const segs = [];
    for (const line of text.split("\n")) {
        const m = line.match(/Speaker (\d+): ([\d.]+)s - ([\d.]+)s/);
        if (m)
            segs.push({ speaker: +m[1], start: +m[2], end: +m[3] });
    }
    return segs.sort((a, b) => a.start - b.start);
}
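// Extract the raw PCM payload from a WAV file by locating its "data" chunk
// (assumes the first occurrence of "data" marks the chunk header).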
function readPcm(wavPath) {
    const buf = readFileSync(wavPath);
    const dataOffset = buf.indexOf("data") + 4;
    return buf.subarray(dataOffset + 4, dataOffset + 4 + buf.readUInt32LE(dataOffset));
}
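// Write a slice [startSec, endSec) of 16 kHz mono 16-bit PCM as a standalone
// WAV file; returns false when the slice would be empty.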
function writeWavSlice(pcm, startSec, endSec, outPath) {
    const SR = 16000;
    const BPS = 2;
    const startByte = Math.floor(startSec * SR) * BPS;
    const endByte = Math.min(Math.ceil(endSec * SR) * BPS, pcm.length);
    if (startByte >= endByte)
        return false;
    const slice = pcm.subarray(startByte, endByte);
    const hdr = Buffer.alloc(44);
    hdr.write("RIFF", 0);
    hdr.writeUInt32LE(36 + slice.length, 4);
    hdr.write("WAVEfmt ", 8);
    hdr.writeUInt32LE(16, 16);
    hdr.writeUInt16LE(1, 20);
    hdr.writeUInt16LE(1, 22);
    hdr.writeUInt32LE(SR, 24);
    hdr.writeUInt32LE(SR * BPS, 28);
    hdr.writeUInt16LE(BPS, 32);
    hdr.writeUInt16LE(16, 34);
    hdr.write("data", 36);
    hdr.writeUInt32LE(slice.length, 40);
    writeFileSync(outPath, Buffer.concat([hdr, slice]));
    return true;
}
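// Merge consecutive segments that share a speaker into a single entry.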
function mergeSpeakers(entries) {
    const out = [];
    for (const e of entries) {
        const last = out[out.length - 1];
        if (last && last.speaker === e.speaker) {
            last.text += " " + e.text;
            last.end = e.end;
        }
        else {
            out.push({ ...e });
        }
    }
    return out;
}

Tip: all examples throughout this documentation are self-contained and runnable. For instructions on how to run them, see SDK quickstart.
