Input

Audio Format Requirements

ShunyaLabs accepts audio in the following format for livestream transcription:

| Parameter   | Requirement                       |
|-------------|-----------------------------------|
| Format      | Float32 (IEEE 754), little-endian |
| Sample Rate | 16,000 Hz (16 kHz)                |
| Channels    | 1 (mono)                          |
| Data Type   | Float32 (32-bit floating point)   |
| Chunk Size  | 200–300 ms recommended            |
| Encoding    | Base64 for transmission           |
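
As a quick sketch of this format (assuming your source audio is already 16 kHz mono int16 PCM, for example decoded from a WAV file), the Python snippet below converts the samples to little-endian Float32 and base64-encodes one 250 ms chunk; the variable names are illustrative, not part of the API:

import base64
import numpy as np

# Assumed input: 16 kHz mono PCM as int16 samples (placeholder: 1 second of silence)
int16_samples = np.zeros(16000, dtype=np.int16)

# Convert to Float32 in the range [-1.0, 1.0]
float32_samples = int16_samples.astype(np.float32) / 32768.0

# A 250 ms chunk at 16 kHz is 4,000 samples (within the 200-300 ms recommendation)
chunk = float32_samples[:4000]

# Force little-endian Float32 bytes and base64-encode for the audio_inline_b64 field
chunk_b64 = base64.b64encode(chunk.astype('<f4').tobytes()).decode('utf-8')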

Request Parameters

Initialize Session (init)

| Parameter       | Type   | Default  | Description                                                         |
|-----------------|--------|----------|---------------------------------------------------------------------|
| action          | string | Required | Must be "send"                                                      |
| type            | string | Required | Must be "init"                                                      |
| config          | object | Required | Configuration options                                               |
| config.language | string | null     | Language code (e.g., 'en', 'es', 'fr'). Use null for auto-detection |
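
For reference, a minimal Python sketch of the resulting init message (here requesting language auto-detection by sending null):

import json

init_message = json.dumps({
    "action": "send",
    "type": "init",
    "config": {"language": None}  # None serializes to JSON null, enabling auto-detection
})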

Send Audio Frame (frame)

| Parameter        | Type    | Default  | Description                              |
|------------------|---------|----------|------------------------------------------|
| action           | string  | Required | Must be "send"                           |
| type             | string  | Required | Must be "frame"                          |
| frame_seq        | integer | Required | Sequential frame number starting from 0  |
| audio_inline_b64 | string  | Required | Base64-encoded audio data                |
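
Each frame message wraps one base64-encoded audio chunk with an incrementing sequence number; a minimal Python sketch (the silent placeholder stands in for real samples):

import base64
import json

# Placeholder: 250 ms of Float32 silence (4,000 samples x 4 bytes)
chunk_b64 = base64.b64encode(b"\x00" * 16000).decode("utf-8")

frame_message = json.dumps({
    "action": "send",
    "type": "frame",
    "frame_seq": 0,                # increment by 1 for each subsequent frame
    "audio_inline_b64": chunk_b64
})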

End Stream

To end the stream, send a final frame with audio_inline_b64 set to the base64 encoding of the literal string "END_OF_AUDIO". The base64 value is RU5EX09GX0FVRElP.
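
A quick Python check reproduces the sentinel value:

import base64

sentinel = base64.b64encode(b"END_OF_AUDIO").decode("utf-8")
print(sentinel)  # RU5EX09GX0FVRElP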

Complete JavaScript Example

Full implementation with audio capture from the microphone.

class TranscriptionClient {
  constructor(apiEndpoint) {
    this.apiEndpoint = apiEndpoint;
    this.ws = null;
    this.frameSeq = 0;
    this.audioContext = null;
  }

  async connect() {
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket(this.apiEndpoint);

      this.ws.onopen = () => {
        console.log('Connected to transcription service');
        this.initializeSession();
        resolve();
      };

      this.ws.onmessage = (event) => {
        this.handleTranscriptionResult(JSON.parse(event.data));
      };

      this.ws.onerror = (error) => {
        console.error('WebSocket error:', error);
        reject(error);
      };
    });
  }

  initializeSession() {
    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'init',
      config: { language: 'en' }
    }));
  }

  async startRecording() {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.audioContext = new AudioContext({ sampleRate: 16000 });
      const source = this.audioContext.createMediaStreamSource(stream);

      // ScriptProcessorNode is deprecated in favor of AudioWorklet, but remains widely supported
      const processor = this.audioContext.createScriptProcessor(4096, 1, 1);
      processor.onaudioprocess = (event) => {
        const audioData = event.inputBuffer.getChannelData(0);
        this.sendAudioFrame(audioData);
      };

      source.connect(processor);
      processor.connect(this.audioContext.destination);
    } catch (error) {
      console.error('Error starting recording:', error);
    }
  }

  sendAudioFrame(audioData) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;

    const base64Audio = this.arrayBufferToBase64(audioData.buffer);

    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'frame',
      frame_seq: this.frameSeq++,
      audio_inline_b64: base64Audio
    }));
  }

  endStream() {
    const eos = btoa('END_OF_AUDIO');
    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'frame',
      frame_seq: this.frameSeq++,
      audio_inline_b64: eos
    }));
  }

  handleTranscriptionResult(result) {
    if (result.segments) {
      result.segments.forEach(segment => {
        if (segment.completed) {
          console.log(`[${segment.start}s - ${segment.end}s]: ${segment.text}`);
        }
      });
    }
  }

  arrayBufferToBase64(buffer) {
    const bytes = new Uint8Array(buffer);
    let binary = '';
    for (let i = 0; i < bytes.byteLength; i++) {
      binary += String.fromCharCode(bytes[i]);
    }
    return btoa(binary);
  }
}

// Usage
const client = new TranscriptionClient('wss://tl.shunyalabs.ai/');

async function main() {
  await client.connect();
  await client.startRecording();

  setTimeout(() => {
    client.endStream();
  }, 30000);
}

main();

Complete Python Example

Full implementation with microphone audio capture and real-time transcription.

import websocket
import json
import base64
import numpy as np
import pyaudio
import threading
import time

class TranscriptionClient:
    def __init__(self, api_endpoint):
        self.api_endpoint = api_endpoint
        self.ws = None
        self.frame_seq = 0
        self.audio = pyaudio.PyAudio()
        self.stream = None
        self.recording = False

    def on_message(self, ws, message):
        result = json.loads(message)

        if 'segments' in result:
            for segment in result['segments']:
                if segment.get('completed', False):
                    print(f"[{segment['start']}s - {segment['end']}s]: {segment['text']}")

    def on_error(self, ws, error):
        print(f"Error: {error}")

    def on_close(self, ws, close_status_code, close_msg):
        print("Connection closed")

    def on_open(self, ws):
        print("Connected to transcription service")
        self.initialize_session()
        self.start_recording()

    def initialize_session(self):
        init_message = {
            "action": "send",
            "type": "init",
            "config": {
                "language": "en"
            }
        }

        self.ws.send(json.dumps(init_message))

    def start_recording(self):
        self.recording = True

        self.stream = self.audio.open(
            format=pyaudio.paFloat32,
            channels=1,
            rate=16000,
            input=True,
            frames_per_buffer=4096,
            stream_callback=self.audio_callback
        )

        self.stream.start_stream()

    def audio_callback(self, in_data, frame_count, time_info, status):
        if self.recording and self.ws:
            audio_data = np.frombuffer(in_data, dtype=np.float32)
            self.send_audio_frame(audio_data)

        return (in_data, pyaudio.paContinue)

    def send_audio_frame(self, audio_data):
        if not self.ws or self.ws.sock is None:
            return

        audio_bytes = audio_data.astype(np.float32).tobytes()
        base64_audio = base64.b64encode(audio_bytes).decode('utf-8')

        frame_message = {
            "action": "send",
            "type": "frame",
            "frame_seq": self.frame_seq,
            "audio_inline_b64": base64_audio
        }

        self.ws.send(json.dumps(frame_message))
        self.frame_seq += 1

    def stop_recording(self):
        self.recording = False
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()

    def end_session(self):
        end_of_audio = 'END_OF_AUDIO'.encode('utf-8')
        base64_sentinel = base64.b64encode(end_of_audio).decode('utf-8')

        frame_msg = {
            "action": "send",
            "type": "frame",
            "frame_seq": self.frame_seq,
            "audio_inline_b64": base64_sentinel
        }
        self.ws.send(json.dumps(frame_msg))

    def connect(self):
        self.ws = websocket.WebSocketApp(
            self.api_endpoint,
            on_open=self.on_open,
            on_message=self.on_message,
            on_error=self.on_error,
            on_close=self.on_close
        )

        wst = threading.Thread(target=self.ws.run_forever)
        wst.daemon = True
        wst.start()

        return wst

# Usage example
if __name__ == '__main__':
    client = TranscriptionClient('wss://tl.shunyalabs.ai/')

    try:
        client.connect()

        while True:
            time.sleep(1)

    except KeyboardInterrupt:
        print('Stopping...')
        client.stop_recording()
        client.end_session()
        client.audio.terminate()