Input

Audio Format Requirements

ShunyaLabs accepts audio in the following format for livestream transcription:

| Parameter   | Requirement                       |
|-------------|-----------------------------------|
| Format      | Float32 (IEEE 754), little-endian |
| Sample Rate | 16,000 Hz (16 kHz)                |
| Channels    | 1 (mono)                          |
| Data Type   | Float32 (32-bit floating point)   |
| Chunk Size  | 200–300 ms recommended            |
| Encoding    | Base64 for transmission           |
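
As a quick sketch of this format (assuming your source audio is already 16 kHz mono int16 PCM, for example decoded from a WAV file), the Python snippet below converts the samples to little-endian Float32 and base64-encodes one 250 ms chunk; the variable names are illustrative, not part of the API:

import base64
import numpy as np

# Assumed input: 16 kHz mono PCM as int16 samples (placeholder: 1 second of silence)
int16_samples = np.zeros(16000, dtype=np.int16)

# Convert to Float32 in the range [-1.0, 1.0]
float32_samples = int16_samples.astype(np.float32) / 32768.0

# A 250 ms chunk at 16 kHz is 4,000 samples (within the 200-300 ms recommendation)
chunk = float32_samples[:4000]

# Force little-endian Float32 bytes and base64-encode for the audio_inline_b64 field
chunk_b64 = base64.b64encode(chunk.astype('<f4').tobytes()).decode('utf-8')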

Request Parameters

Initialize Session (init)

| Parameter       | Type   | Default  | Description                                                         |
|-----------------|--------|----------|---------------------------------------------------------------------|
| action          | string | Required | Must be "send"                                                      |
| type            | string | Required | Must be "init"                                                      |
| config          | object | Required | Configuration options                                               |
| config.language | string | null     | Language code (e.g., 'en', 'es', 'fr'). Use null for auto-detection |
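
For reference, a minimal Python sketch of the resulting init message (here requesting language auto-detection by sending null):

import json

init_message = json.dumps({
    "action": "send",
    "type": "init",
    "config": {"language": None}  # None serializes to JSON null, enabling auto-detection
})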

Send Audio Frame (frame)

| Parameter        | Type    | Default  | Description                              |
|------------------|---------|----------|------------------------------------------|
| action           | string  | Required | Must be "send"                           |
| type             | string  | Required | Must be "frame"                          |
| frame_seq        | integer | Required | Sequential frame number starting from 0  |
| audio_inline_b64 | string  | Required | Base64-encoded audio data                |
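
Each frame message wraps one base64-encoded audio chunk with an incrementing sequence number; a minimal Python sketch (the silent placeholder stands in for real samples):

import base64
import json

# Placeholder: 250 ms of Float32 silence (4,000 samples x 4 bytes)
chunk_b64 = base64.b64encode(b"\x00" * 16000).decode("utf-8")

frame_message = json.dumps({
    "action": "send",
    "type": "frame",
    "frame_seq": 0,                # increment by 1 for each subsequent frame
    "audio_inline_b64": chunk_b64
})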

End Stream

To end the stream, send a final frame with audio_inline_b64 set to the base64 encoding of the literal string "END_OF_AUDIO". The base64 value is RU5EX09GX0FVRElP.
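
A quick Python check reproduces the sentinel value:

import base64

sentinel = base64.b64encode(b"END_OF_AUDIO").decode("utf-8")
print(sentinel)  # RU5EX09GX0FVRElP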

Complete JavaScript Example

Full implementation with audio capture from the microphone.

class TranscriptionClient {
  constructor(apiEndpoint) {
    this.apiEndpoint = apiEndpoint;
    this.ws = null;
    this.frameSeq = 0;
    this.audioContext = null;
  }

  async connect() {
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket(this.apiEndpoint);

      this.ws.onopen = () => {
        console.log('Connected to transcription service');
        this.initializeSession();
        resolve();
      };

      this.ws.onmessage = (event) => {
        this.handleTranscriptionResult(JSON.parse(event.data));
      };

      this.ws.onerror = (error) => {
        console.error('WebSocket error:', error);
        reject(error);
      };
    });
  }

  initializeSession() {
    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'init',
      config: { language: 'en' }
    }));
  }

  async startRecording() {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.audioContext = new AudioContext({ sampleRate: 16000 });
      const source = this.audioContext.createMediaStreamSource(stream);

      // ScriptProcessorNode is deprecated in favor of AudioWorklet, but remains widely supported
      const processor = this.audioContext.createScriptProcessor(4096, 1, 1);
      processor.onaudioprocess = (event) => {
        const audioData = event.inputBuffer.getChannelData(0);
        this.sendAudioFrame(audioData);
      };

      source.connect(processor);
      processor.connect(this.audioContext.destination);
    } catch (error) {
      console.error('Error starting recording:', error);
    }
  }

  sendAudioFrame(audioData) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;

    const base64Audio = this.arrayBufferToBase64(audioData.buffer);

    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'frame',
      frame_seq: this.frameSeq++,
      audio_inline_b64: base64Audio
    }));
  }

  endStream() {
    const eos = btoa('END_OF_AUDIO');
    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'frame',
      frame_seq: this.frameSeq++,
      audio_inline_b64: eos
    }));
  }

  handleTranscriptionResult(result) {
    if (result.segments) {
      result.segments.forEach(segment => {
        if (segment.completed) {
          console.log(`[${segment.start}s - ${segment.end}s]: ${segment.text}`);
        }
      });
    }
  }

  arrayBufferToBase64(buffer) {
    const bytes = new Uint8Array(buffer);
    let binary = '';
    for (let i = 0; i < bytes.byteLength; i++) {
      binary += String.fromCharCode(bytes[i]);
    }
    return btoa(binary);
  }
}

// Usage
const client = new TranscriptionClient('wss://tl.shunyalabs.ai/');

async function main() {
  await client.connect();
  await client.startRecording();

  setTimeout(() => {
    client.endStream();
  }, 30000);
}

main();

Complete Python Example

Full implementation with microphone audio capture and real-time transcription.

import websocket
import json
import base64
import numpy as np
import pyaudio
import threading
import time

class TranscriptionClient:
    def __init__(self, api_endpoint):
        self.api_endpoint = api_endpoint
        self.ws = None
        self.frame_seq = 0
        self.audio = pyaudio.PyAudio()
        self.stream = None
        self.recording = False

    def on_message(self, ws, message):
        result = json.loads(message)

        if 'segments' in result:
            for segment in result['segments']:
                if segment.get('completed', False):
                    print(f"[{segment['start']}s - {segment['end']}s]: {segment['text']}")

    def on_error(self, ws, error):
        print(f"Error: {error}")

    def on_close(self, ws, close_status_code, close_msg):
        print("Connection closed")

    def on_open(self, ws):
        print("Connected to transcription service")
        self.initialize_session()
        self.start_recording()

    def initialize_session(self):
        init_message = {
            "action": "send",
            "type": "init",
            "config": {
                "language": "en"
            }
        }

        self.ws.send(json.dumps(init_message))

    def start_recording(self):
        self.recording = True

        self.stream = self.audio.open(
            format=pyaudio.paFloat32,
            channels=1,
            rate=16000,
            input=True,
            frames_per_buffer=4096,
            stream_callback=self.audio_callback
        )

        self.stream.start_stream()

    def audio_callback(self, in_data, frame_count, time_info, status):
        if self.recording and self.ws:
            audio_data = np.frombuffer(in_data, dtype=np.float32)
            self.send_audio_frame(audio_data)

        return (in_data, pyaudio.paContinue)

    def send_audio_frame(self, audio_data):
        if not self.ws or self.ws.sock is None:
            return

        audio_bytes = audio_data.astype(np.float32).tobytes()
        base64_audio = base64.b64encode(audio_bytes).decode('utf-8')

        frame_message = {
            "action": "send",
            "type": "frame",
            "frame_seq": self.frame_seq,
            "audio_inline_b64": base64_audio
        }

        self.ws.send(json.dumps(frame_message))
        self.frame_seq += 1

    def stop_recording(self):
        self.recording = False
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()

    def end_session(self):
        end_of_audio = 'END_OF_AUDIO'.encode('utf-8')
        base64_sentinel = base64.b64encode(end_of_audio).decode('utf-8')

        frame_msg = {
            "action": "send",
            "type": "frame",
            "frame_seq": self.frame_seq,
            "audio_inline_b64": base64_sentinel
        }
        self.ws.send(json.dumps(frame_msg))

    def connect(self):
        self.ws = websocket.WebSocketApp(
            self.api_endpoint,
            on_open=self.on_open,
            on_message=self.on_message,
            on_error=self.on_error,
            on_close=self.on_close
        )

        wst = threading.Thread(target=self.ws.run_forever)
        wst.daemon = True
        wst.start()

        return wst

# Usage example
if __name__ == '__main__':
    client = TranscriptionClient('wss://tl.shunyalabs.ai/')

    try:
        client.connect()

        while True:
            time.sleep(1)

    except KeyboardInterrupt:
        print('Stopping...')
        client.stop_recording()
        client.end_session()
        client.audio.terminate()