Standalone OpenAI transcription Python script

OpenAI announced some new audio models yesterday, including two new transcription models (gpt-4o-transcribe and gpt-4o-mini-transcribe). These models perform better than Whisper, though because they are built on top of language models they are prone to prompt injection attacks.
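
The new models also work as drop-in replacements for whisper-1 in the regular transcription endpoint. Here's a minimal sketch using the official openai Python package; audio.wav is just a placeholder filename:

from openai import OpenAI

# Assumes OPENAI_API_KEY is set in the environment
client = OpenAI()

with open("audio.wav", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="gpt-4o-mini-transcribe",
        file=audio_file,
    )

print(transcript.text)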

You can also access them using the Realtime API. After a bit of work (the docs had an error) I got a standalone Python script working that streams audio from your microphone and transcribes it.

# /// script
# dependencies = [
#   "requests<3",
#   "rich",
#   "websockets==13.1",
#   "pyaudio",
# ]
# ///

# Usage: uv run openai_transcribe.py

import asyncio
import base64
import json
import os
import websockets
import pyaudio

async def receive_messages(ws):
    try:
        while True:
            message = await ws.recv()
            data = json.loads(message)
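            # Dump every raw event so the full message stream is visible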
            print(data)
            
            if data.get("type") == "input_audio_buffer.speech_started":
                print("Speech started")
                
            elif data.get("type") == "input_audio_buffer.speech_stopped":
                print("Speech stopped")
                
            elif data.get("type") == "conversation.item.input_audio_transcription.partial":
                print(f"Partial: {data.get('transcript')}")
                
            elif data.get("type") == "conversation.item.input_audio_transcription.completed":
                print(f"Complete: {data.get('transcript')}")
                
            else:
                print(f"Other event: {data.get('type')}")
    except asyncio.CancelledError:
        print("Receiver task cancelled")
    except Exception as e:
        print(f"Error in receiver: {e}")

async def transcribe_from_microphone():
    # Connect to OpenAI's WebSocket
    uri = "wss://api.openai.com/v1/realtime?intent=transcription"

    headers = {
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        "openai-beta": "realtime=v1",
    }
    
    # Initialize PyAudio
    p = pyaudio.PyAudio()
    
    # Configure audio stream
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=24000,
        input=True,
        frames_per_buffer=int(24000 * 0.02),  # 20ms chunks
        start=False
    )
    
    async with websockets.connect(uri, extra_headers=headers) as ws:
        print("Connected to OpenAI WebSocket")
        
        # Configure transcription session
        config = {
            "type": "transcription_session.update",
            "session": {
                "input_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "gpt-4o-mini-transcribe",
                    "prompt": "",
                    "language": "en"
                },
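                # semantic_vad lets the model decide when the speaker has finished a turn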
                "turn_detection": {
                    "type": "semantic_vad"
                },
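                # near_field noise reduction is tuned for close-talking microphones like headsets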
                "input_audio_noise_reduction": {
                    "type": "near_field"
                },
            }
        }
        
        await ws.send(json.dumps(config))
        print("Sent configuration")
        
        # Create a task to receive messages
        receive_task = asyncio.create_task(receive_messages(ws))
        
        # Start recording
        stream.start_stream()
        print("Listening... (Press Ctrl+C to stop)")

        try:
            # Keep sending audio chunks
            while True:
                audio_data = stream.read(int(24000 * 0.02), exception_on_overflow=False)
                payload = base64.b64encode(audio_data).decode("utf-8")
                await ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": payload,
                }))
                await asyncio.sleep(0.01)  # Yield so the receiver task can process incoming events
        except KeyboardInterrupt:
            print("\nStopping recording...")
        finally:
            # Clean up
            stream.stop_stream()
            stream.close()
            p.terminate()
            receive_task.cancel()
            print("Recording stopped")

if __name__ == "__main__":
    if not os.getenv("OPENAI_API_KEY"):
        print("Error: OPENAI_API_KEY environment variable not set")
        print("Please set your OpenAI API key: export OPENAI_API_KEY=your_key_here")
        exit(1)
    
    print("Starting microphone transcription...")
    try:
        asyncio.run(transcribe_from_microphone())
    except KeyboardInterrupt:
        print("\nProgram terminated by user")