OpenAI announced some new audio models yesterday, including new transcription models (gpt-4o-transcribe and gpt-4o-mini-transcribe). These models perform better than Whisper, though because they are built on top of language models, they are prone to prompt injection attacks.
You can also access them using the Realtime API. After a bit of work (the docs had an error) I got a standalone Python script working that streams audio from your microphone and transcribes it.
# /// script
# dependencies = [
# "requests<3",
# "rich",
# "websockets==13.1",
# "pyaudio",
# ]
# ///
# Usage: uv run openai_transcribe.py
import asyncio
import base64
import json
import os
import websockets
import pyaudio
async def receive_messages(ws):
    """Read events from the realtime WebSocket and print transcription updates.

    Runs until the connection drops (any exception from ``ws.recv()``) or the
    task is cancelled. Every incoming message is JSON; the raw event is echoed
    first, then known transcription event types get a friendlier line.
    """
    try:
        while True:
            data = json.loads(await ws.recv())
            print(data)  # raw event echo, useful while debugging the session
            event_type = data.get("type")
            if event_type == "input_audio_buffer.speech_started":
                print("Speech started")
            elif event_type == "input_audio_buffer.speech_stopped":
                print("Speech stopped")
            elif event_type in (
                # The Realtime API emits ".delta" for incremental text;
                # ".partial" is kept from the original handler just in case.
                "conversation.item.input_audio_transcription.delta",
                "conversation.item.input_audio_transcription.partial",
            ):
                print(f"Partial: {data.get('transcript') or data.get('delta')}")
            elif event_type == "conversation.item.input_audio_transcription.completed":
                print(f"Complete: {data.get('transcript')}")
            else:
                print(f"Other event: {event_type}")
    except asyncio.CancelledError:
        # CancelledError is a BaseException (3.8+), so the handler below
        # would not catch it; swallow it deliberately for a quiet shutdown.
        print("Receiver task cancelled")
    except Exception as e:
        print(f"Error in receiver: {e}")
async def transcribe_from_microphone():
    """Stream microphone audio to OpenAI's realtime transcription endpoint.

    Opens a PyAudio input stream (24 kHz, mono, 16-bit PCM), connects to the
    realtime WebSocket with the transcription intent, configures a
    gpt-4o-mini-transcribe session with semantic VAD and near-field noise
    reduction, then forwards base64-encoded 20 ms audio chunks while a
    background task (``receive_messages``) prints transcription events.

    Requires the OPENAI_API_KEY environment variable to be set.
    """
    uri = "wss://api.openai.com/v1/realtime?intent=transcription"
    headers = {
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        "openai-beta": "realtime=v1",
    }

    sample_rate = 24000  # pcm16 sample rate expected by the realtime API
    chunk_frames = int(sample_rate * 0.02)  # 20 ms of audio per send

    # Initialize PyAudio and open (but don't start) the capture stream.
    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=sample_rate,
        input=True,
        frames_per_buffer=chunk_frames,
        start=False,
    )

    # NOTE: websockets==13.1 (pinned in the script header) takes
    # ``extra_headers``; websockets >= 14 renamed it to ``additional_headers``.
    async with websockets.connect(uri, extra_headers=headers) as ws:
        print("Connected to OpenAI WebSocket")
        # Configure transcription session
        config = {
            "type": "transcription_session.update",
            "session": {
                "input_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "gpt-4o-mini-transcribe",
                    "prompt": "",
                    "language": "en",
                },
                "turn_detection": {
                    "type": "semantic_vad",
                },
                "input_audio_noise_reduction": {
                    "type": "near_field",
                },
            },
        }
        await ws.send(json.dumps(config))
        print("Sent configuration")

        # Drain server events concurrently with the send loop below.
        receive_task = asyncio.create_task(receive_messages(ws))

        stream.start_stream()
        print("Listening... (Press Ctrl+C to stop)")
        try:
            while True:
                # stream.read() blocks for ~20 ms per chunk; run it in a
                # worker thread so the event loop (and the receiver task)
                # keeps making progress. The blocking read also paces the
                # send loop, so no extra sleep is needed.
                audio_data = await asyncio.to_thread(stream.read, chunk_frames)
                payload = base64.b64encode(audio_data).decode("utf-8")
                await ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": payload,
                }))
        except KeyboardInterrupt:
            print("\nStopping recording...")
        finally:
            # Clean up audio resources, then let the receiver task finish
            # cancelling so its cleanup prints before we return.
            stream.stop_stream()
            stream.close()
            p.terminate()
            receive_task.cancel()
            try:
                await receive_task
            except asyncio.CancelledError:
                pass
            print("Recording stopped")
if __name__ == "__main__":
    # Fail fast with guidance if the API key is missing.
    if not os.getenv("OPENAI_API_KEY"):
        print("Error: OPENAI_API_KEY environment variable not set")
        print("Please set your OpenAI API key: export OPENAI_API_KEY=your_key_here")
        # SystemExit rather than the site-injected exit() builtin, which is
        # not guaranteed to exist (e.g. when run with -S).
        raise SystemExit(1)
    print("Starting microphone transcription...")
    try:
        asyncio.run(transcribe_from_microphone())
    except KeyboardInterrupt:
        print("\nProgram terminated by user")