The streaming audio transcription endpoint is only compatible with WebSockets, not with the REST API. To begin using it, establish a WebSocket connection. Once connected, you must first send a metadata JSON object (serialized as a string) over the WebSocket; this metadata tells the model what format and type of audio to expect. After the metadata is sent, you can stream raw audio bytes directly over the same WebSocket connection.
wss://model-{model_id}.api.baseten.co/environments/production/websocket
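
A minimal sketch of that sequence, assuming a local 16 kHz, mono, pcm_s16le file named audio.raw (the file name and chunk pacing are illustrative; model_id and BASETEN_API_KEY are placeholders, as in the full microphone example later on this page):

import asyncio
import json
import os

import websockets

model_id = ""  # Baseten model id here
headers = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}
ws_url = f"wss://model-{model_id}.api.baseten.co/environments/production/websocket"

metadata = {
    "streaming_params": {"encoding": "pcm_s16le", "sample_rate": 16000},
    "whisper_params": {"audio_language": "en"},
}

async def transcribe_file(path):
    async with websockets.connect(ws_url, extra_headers=headers) as ws:
        # 1. Send the metadata JSON object as a string before any audio
        await ws.send(json.dumps(metadata))
        # 2. Stream raw pcm_s16le bytes over the same connection
        with open(path, "rb") as f:
            while chunk := f.read(3200):  # ~100 ms of 16 kHz, 16-bit, mono audio
                await ws.send(chunk)
                await asyncio.sleep(0.1)  # pace the stream roughly in real time
        # 3. Print transcript messages (stop with Ctrl+C if the server keeps the connection open)
        async for message in ws:
            print(message)

asyncio.run(transcribe_file("audio.raw"))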

Parameters

model_id
string
required
The ID of the model you want to call.
Authorization
string
required
Your Baseten API key, formatted with the prefix Api-Key (e.g. {"Authorization": "Api-Key abcd1234.abcd1234"}).

WebSocket Metadata

streaming_vad_config
object
These parameters configure the Voice Activity Detector (VAD) and allow you to tune behavior such as speech endpointing.
  • threshold (float): The probability threshold for detecting speech, between 0.0 and 1.0. Frames with a probability above this value are considered speech. A higher threshold makes the VAD more selective, reducing false positives from background noise.
  • min_silence_duration_ms (int): The minimum duration of silence (in milliseconds) required to determine that speech has ended.
  • speech_pad_ms (int): Padding (in milliseconds) added to both the start and end of detected speech segments to avoid cutting off words prematurely.
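
As an illustrative sketch, a noisier environment might call for a more selective detector and a longer endpointing pause (the values below are examples, not recommendations):

streaming_vad_config = {
    "threshold": 0.7,                # only frames with a higher speech probability count as speech
    "min_silence_duration_ms": 500,  # require a longer pause before ending a segment
    "speech_pad_ms": 50,             # keep extra audio around detected speech
}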
streaming_params
object
Parameters for controlling streaming ASR behavior.
  • encoding (string, default="pcm_s16le"): Audio encoding format.
  • sample_rate (int, default=16000): Audio sample rate in Hz. Whisper models are optimized for a sample rate of 16,000 Hz.
  • enable_partial_transcripts (boolean, optional): If set to true, intermediate (partial) transcripts will be sent over the WebSocket as audio is received. For most voice AI use cases, we recommend setting this to false.
  • partial_transcript_interval_s (float, default=0.5): Interval in seconds that the model waits before sending a partial transcript, if partials are enabled.
  • final_transcript_max_duration_s (int, default=30): The maximum duration of buffered audio (in seconds) before a final transcript is forcibly returned. This value should not exceed 30.
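
For instance, a voice agent that only acts on final transcripts might use a configuration along these lines (illustrative values):

streaming_params = {
    "encoding": "pcm_s16le",
    "sample_rate": 16000,
    "enable_partial_transcripts": False,    # recommended for most voice AI use cases
    "final_transcript_max_duration_s": 30,  # must not exceed 30
}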
whisper_params
object
Parameters for controlling Whisper’s behavior.
  • prompt (string, optional): Optional transcription prompt.
  • audio_language (string, default="en"): Language of the input audio. Set to "auto" for automatic detection.
  • language_detection_only (boolean, default=false): If true, only return the automatic language detection result without transcribing.
  • language_options (list[string], default=[]): List of language codes to consider for language detection, for example ["en", "zh"]. This can improve language detection accuracy by scoping detection to the set of languages that makes sense for your use case. By default, all languages supported by the Whisper model are considered. [Added since v0.5.0]
  • use_dynamic_preprocessing (boolean, default=false): Enables dynamic range compression to process audio with variable loudness.
  • show_word_timestamps (boolean, default=false): If true, include word-level timestamps in the output. [Added since v0.4.0]
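
For example, to auto-detect the language while scoping detection to a couple of expected languages (the codes and flags below are illustrative):

whisper_params = {
    "audio_language": "auto",          # detect the language automatically
    "language_options": ["en", "zh"],  # only consider these languages (v0.5.0+)
    "show_word_timestamps": True,      # include word-level timestamps (v0.4.0+)
}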
whisper_sampling_params
object
Advanced parameters for controlling Whisper’s sampling behavior.
  • beam_width (integer, optional): Beam search width for decoding. Controls the number of candidate sequences to maintain during beam search. [Added since v0.6.0]
  • length_penalty (float, optional): Length penalty applied to the output. Higher values encourage longer outputs. [Added since v0.6.0]
  • repetition_penalty (float, optional): Penalty for repeating tokens. Higher values discourage repetition. [Added since v0.6.0]
  • beam_search_diversity_rate (float, optional): Controls diversity in beam search. Higher values increase diversity among beam candidates. [Added since v0.6.0]
  • no_repeat_ngram_size (integer, optional): Prevents repetition of n-grams of the specified size. [Added since v0.6.0]
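
An illustrative sketch of these sampling parameters (values are examples only; per the asr_options deprecation notes below, they appear to be nested under whisper_params in the metadata):

whisper_sampling_params = {
    "beam_width": 4,            # keep 4 candidate sequences during beam search
    "length_penalty": 1.0,      # neutral preference for output length
    "repetition_penalty": 1.1,  # mildly discourage repeated tokens
}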
asr_options
object
Advanced settings for automatic speech recognition (ASR) process.
  • beam_size (integer, default=1): Beam search size for decoding. We support beam size up to 5. [Deprecated since v0.6.0. Use whisper_input.whisper_params.whisper_sampling_params.beam_width instead.]
  • length_penalty (float, default=2.0): Length penalty applied to ASR output. The length penalty only takes effect when beam_size is greater than 1. [Deprecated since v0.6.0. Use whisper_input.whisper_params.whisper_sampling_params.length_penalty instead.]
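
The complete client example below puts these pieces together: it streams live microphone audio over the WebSocket and prints partial and final transcripts as they arrive.
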
import asyncio
import websockets
import sounddevice as sd
import numpy as np
import json
import os

model_id = ""  # Baseten model id here
baseten_api_key = os.environ["BASETEN_API_KEY"]

# Audio config
SAMPLE_RATE = 16000
CHUNK_SIZE = 512
CHUNK_DURATION = CHUNK_SIZE / SAMPLE_RATE  # seconds of audio per chunk (0.032 s)
CHANNELS = 1

headers = {"Authorization": f"Api-Key {baseten_api_key}"}


# Metadata to send first
metadata = {
    "streaming_vad_config": {
        "threshold": 0.5,
        "min_silence_duration_ms": 300,
        "speech_pad_ms": 30
    },
    "streaming_params": {
        "encoding": "pcm_s16le",
        "sample_rate": SAMPLE_RATE,
        "enable_partial_transcripts": True
    },
    "whisper_params": {"audio_language": "en"},
}

async def stream_microphone_audio(ws_url):
    loop = asyncio.get_running_loop()
    async with websockets.connect(ws_url, extra_headers=headers) as ws:
        print("Connected to server")

        # Send the metadata JSON blob
        await ws.send(json.dumps(metadata))
        print("Sent metadata to server")

        send_queue = asyncio.Queue()

        # Start audio stream
        def audio_callback(indata, frames, time_info, status):
            if status:
                print(f"Audio warning: {status}")
            # Convert float32 samples in [-1.0, 1.0] to 16-bit PCM (pcm_s16le) bytes
            int16_data = (np.clip(indata, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
            loop.call_soon_threadsafe(send_queue.put_nowait, int16_data)

        with sd.InputStream(
                samplerate=SAMPLE_RATE,
                blocksize=CHUNK_SIZE,
                channels=CHANNELS,
                dtype="float32",
                callback=audio_callback,
        ):
            print("Streaming mic audio...")

            async def send_audio():
                while True:
                    chunk = await send_queue.get()
                    await ws.send(chunk)

            async def receive_server_messages():
                while True:
                    response = await ws.recv()
                    try:
                        message = json.loads(response)
                        is_final = message.get("is_final")
                        transcript = message.get("transcript")

                        if is_final is True:
                            print(f"[final] {transcript}")
                        elif is_final is False:
                            print(f"[partial] {transcript}")
                        else:
                            print(f"[unknown type] {message}")
                    except Exception as e:
                        print("Non-JSON message or parse error:", response, "| Error:", str(e))

            # Run send + receive tasks concurrently
            await asyncio.gather(send_audio(), receive_server_messages())


# WebSocket endpoint URL
ws_url = f"wss://model-{model_id}.api.baseten.co/environments/production/websocket"

asyncio.run(stream_microphone_audio(ws_url))
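
A final transcript message returned over the WebSocket looks like the following: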

{
  "is_final": true,
  "transcript": "That's one small step for man, one giant leap for mankind.",
  "confidence": 0.89,
  "language_code": "en",
  "language_prob": null,
  "segments": [
    {
      "text": "That's one small step for man, one giant leap for mankind.",
      "log_prob": -0.8644908666610718,
      "start_time": 0,
      "end_time": 9.92
    }
  ]
}
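Partial transcript messages (sent when enable_partial_transcripts is true) use the same shape with is_final set to false, and each entry in segments carries the segment text, its log probability, and start and end times in seconds.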