Input
Audio Format Requirements
ShunyaLabs accepts audio in the following format for livestream transcription:
| Parameter | Requirement |
|---|---|
| Format | Float32 (IEEE 754), little-endian |
| Sample Rate | 16,000 Hz (16 kHz) |
| Channels | 1 (mono) |
| Data Type | Float32 (32-bit floating point) |
| Chunk Size | 200–300 ms recommended |
| Encoding | Base64 for transmission |
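For reference, here is a minimal Python sketch of the encoding step, assuming you already have a chunk of 16 kHz mono samples in a NumPy array (the 4,000-sample zero-filled chunk below is a placeholder for roughly 250 ms of real audio):

```python
import base64
import numpy as np

# Placeholder: ~250 ms of 16 kHz mono audio (4,000 samples) for illustration
samples = np.zeros(4000, dtype=np.float32)

# Little-endian IEEE 754 Float32 bytes, then base64 for transmission
audio_bytes = samples.astype('<f4').tobytes()
audio_inline_b64 = base64.b64encode(audio_bytes).decode('utf-8')
```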
Request Parameters
Initialize Session (`init`)
| Parameter | Type | Default | Description |
|---|---|---|---|
| action | string | Required | Must be "send" |
| type | string | Required | Must be "init" |
| config | object | Required | Configuration options |
| config.language | string | null | Language code (e.g., 'en', 'es', 'fr'). Use null for auto-detection |
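Put together, an init message for English transcription can be built like this (a sketch using Python's json module; set language to None, serialized as JSON null, for auto-detection):

```python
import json

init_message = json.dumps({
    "action": "send",
    "type": "init",
    "config": {"language": "en"}  # use None (JSON null) for auto-detection
})
# Send init_message as the first message after the WebSocket connection opens
```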
Send Audio Frame (`frame`)
| Parameter | Type | Default | Description |
|---|---|---|---|
| action | string | Required | Must be "send" |
| type | string | Required | Must be "frame" |
| frame_seq | integer | Required | Sequential frame number starting from 0 |
| audio_inline_b64 | string | Required | Base64-encoded audio data |
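A frame message wrapping one base64-encoded chunk might look like this (a sketch; the zero-filled chunk stands in for live microphone audio in the format described above):

```python
import base64
import json

import numpy as np

# Placeholder chunk; in practice this is live Float32, 16 kHz, mono audio
chunk = np.zeros(4000, dtype=np.float32)

frame_message = json.dumps({
    "action": "send",
    "type": "frame",
    "frame_seq": 0,  # increment by 1 for each subsequent frame
    "audio_inline_b64": base64.b64encode(chunk.tobytes()).decode('utf-8')
})
```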
End Stream
To end the stream, send a final frame with audio_inline_b64 set to the base64 encoding of the literal string "END_OF_AUDIO". The base64 value is RU5EX09GX0FVRElP.
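The sentinel can be verified in one line of Python:

```python
import base64

# base64 of the literal string "END_OF_AUDIO"
sentinel = base64.b64encode(b'END_OF_AUDIO').decode('utf-8')
assert sentinel == 'RU5EX09GX0FVRElP'
```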
Complete JavaScript Example
Full implementation with audio capture from the microphone.
```javascript
class TranscriptionClient {
  constructor(apiEndpoint) {
    this.apiEndpoint = apiEndpoint;
    this.ws = null;
    this.frameSeq = 0;
    this.audioContext = null;
  }

  async connect() {
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket(this.apiEndpoint);

      this.ws.onopen = () => {
        console.log('Connected to transcription service');
        this.initializeSession();
        resolve();
      };

      this.ws.onmessage = (event) => {
        this.handleTranscriptionResult(JSON.parse(event.data));
      };

      this.ws.onerror = (error) => {
        console.error('WebSocket error:', error);
        reject(error);
      };
    });
  }

  initializeSession() {
    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'init',
      config: { language: 'en' }
    }));
  }

  async startRecording() {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      this.audioContext = new AudioContext({ sampleRate: 16000 });
      const source = this.audioContext.createMediaStreamSource(stream);
      // 4096 samples at 16 kHz is ~256 ms per chunk, within the recommended 200–300 ms range
      // (ScriptProcessorNode is deprecated; AudioWorklet is the modern alternative)
      const processor = this.audioContext.createScriptProcessor(4096, 1, 1);

      processor.onaudioprocess = (event) => {
        const audioData = event.inputBuffer.getChannelData(0); // Float32Array, mono
        this.sendAudioFrame(audioData);
      };

      source.connect(processor);
      processor.connect(this.audioContext.destination);
    } catch (error) {
      console.error('Error starting recording:', error);
    }
  }

  sendAudioFrame(audioData) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;

    const base64Audio = this.arrayBufferToBase64(audioData.buffer);
    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'frame',
      frame_seq: this.frameSeq++,
      audio_inline_b64: base64Audio
    }));
  }

  endStream() {
    // Final frame carries the base64-encoded "END_OF_AUDIO" sentinel
    const eos = btoa('END_OF_AUDIO');
    this.ws.send(JSON.stringify({
      action: 'send',
      type: 'frame',
      frame_seq: this.frameSeq++,
      audio_inline_b64: eos
    }));
  }

  handleTranscriptionResult(result) {
    if (result.segments) {
      result.segments.forEach(segment => {
        if (segment.completed) {
          console.log(`[${segment.start}s - ${segment.end}s]: ${segment.text}`);
        }
      });
    }
  }

  arrayBufferToBase64(buffer) {
    const bytes = new Uint8Array(buffer);
    let binary = '';
    for (let i = 0; i < bytes.byteLength; i++) {
      binary += String.fromCharCode(bytes[i]);
    }
    return btoa(binary);
  }
}

// Usage
const client = new TranscriptionClient('wss://tl.shunyalabs.ai/');

async function main() {
  await client.connect();
  await client.startRecording();

  // Record for 30 seconds, then signal end of stream
  setTimeout(() => {
    client.endStream();
  }, 30000);
}

main();
```

Complete Python Example
Full implementation with microphone audio capture and real-time transcription.
```python
import websocket
import json
import base64
import numpy as np
import pyaudio
import threading
import time


class TranscriptionClient:
    def __init__(self, api_endpoint):
        self.api_endpoint = api_endpoint
        self.ws = None
        self.frame_seq = 0
        self.audio = pyaudio.PyAudio()
        self.stream = None
        self.recording = False

    def on_message(self, ws, message):
        result = json.loads(message)
        if 'segments' in result:
            for segment in result['segments']:
                if segment.get('completed', False):
                    print(f"[{segment['start']}s - {segment['end']}s]: {segment['text']}")

    def on_error(self, ws, error):
        print(f"Error: {error}")

    def on_close(self, ws, close_status_code, close_msg):
        print("Connection closed")

    def on_open(self, ws):
        print("Connected to transcription service")
        self.initialize_session()
        self.start_recording()

    def initialize_session(self):
        init_message = {
            "action": "send",
            "type": "init",
            "config": {
                "language": "en"
            }
        }
        self.ws.send(json.dumps(init_message))

    def start_recording(self):
        self.recording = True
        # Float32 mono at 16 kHz; 4096 frames per buffer is ~256 ms per chunk
        self.stream = self.audio.open(
            format=pyaudio.paFloat32,
            channels=1,
            rate=16000,
            input=True,
            frames_per_buffer=4096,
            stream_callback=self.audio_callback
        )
        self.stream.start_stream()

    def audio_callback(self, in_data, frame_count, time_info, status):
        if self.recording and self.ws:
            audio_data = np.frombuffer(in_data, dtype=np.float32)
            self.send_audio_frame(audio_data)
        return (in_data, pyaudio.paContinue)

    def send_audio_frame(self, audio_data):
        if not self.ws or self.ws.sock is None:
            return
        audio_bytes = audio_data.astype(np.float32).tobytes()
        base64_audio = base64.b64encode(audio_bytes).decode('utf-8')
        frame_message = {
            "action": "send",
            "type": "frame",
            "frame_seq": self.frame_seq,
            "audio_inline_b64": base64_audio
        }
        self.ws.send(json.dumps(frame_message))
        self.frame_seq += 1

    def stop_recording(self):
        self.recording = False
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()

    def end_session(self):
        # Final frame carries the base64-encoded "END_OF_AUDIO" sentinel
        end_of_audio = 'END_OF_AUDIO'.encode('utf-8')
        base64_sentinel = base64.b64encode(end_of_audio).decode('utf-8')
        frame_msg = {
            "action": "send",
            "type": "frame",
            "frame_seq": self.frame_seq,
            "audio_inline_b64": base64_sentinel
        }
        self.ws.send(json.dumps(frame_msg))

    def connect(self):
        self.ws = websocket.WebSocketApp(
            self.api_endpoint,
            on_open=self.on_open,
            on_message=self.on_message,
            on_error=self.on_error,
            on_close=self.on_close
        )
        # Run the WebSocket client in a background thread
        wst = threading.Thread(target=self.ws.run_forever)
        wst.daemon = True
        wst.start()
        return wst


# Usage example
if __name__ == '__main__':
    client = TranscriptionClient('wss://tl.shunyalabs.ai/')
    try:
        client.connect()
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print('Stopping...')
        client.stop_recording()
        client.end_session()
```