Cartesia
Cartesia provides ultra-low latency text-to-speech (Sonic models) and speech-to-text (Ink models) capabilities. With first-byte latency as low as 40ms for TTS and real-time streaming for STT, Cartesia is ideal for conversational AI. Netra helps you track both TTS and STT operations, monitor performance metrics, and analyze usage patterns.

Installation

pip install cartesia netra-sdk

Usage

Initialize Netra before using Cartesia:
import os
from netra import Netra

# Initialize Netra once at startup, before any Cartesia calls are made.
# Authentication uses the x-api-key header; NETRA_API_KEY must be set in
# the environment (os.environ.get returns None if it is missing, which
# would produce the literal header "x-api-key=None").
Netra.init(
    app_name="cartesia-service",
    headers=f"x-api-key={os.environ.get('NETRA_API_KEY')}"
)

Examples

Text-to-Speech with Sonic Models

Track Cartesia TTS operations using Netra decorators:
from cartesia import Cartesia
from netra import task, workflow
import os

# Module-level Cartesia client shared by the decorated functions below.
# CARTESIA_API_KEY must be set in the environment.
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))

@task()
def generate_speech(text: str, voice_id: str) -> bytes:
    """Synthesize *text* with the Sonic English model.

    Returns the complete audio payload as raw bytes
    (pcm_f32le at a 44.1 kHz sample rate).
    """
    voice_spec = {"mode": "id", "id": voice_id}
    audio_format = {
        "container": "raw",
        "encoding": "pcm_f32le",
        "sample_rate": 44100,
    }
    return client.tts.bytes(
        model_id="sonic-english",
        transcript=text,
        voice=voice_spec,
        output_format=audio_format,
    )

@task()
def stream_speech(text: str, voice_id: str):
    """Yield audio chunks for *text* as Sonic Turbo produces them."""
    audio_format = {
        "container": "raw",
        "encoding": "pcm_f32le",
        "sample_rate": 44100,
    }
    # Delegate iteration to the underlying stream; each chunk reaches the
    # caller as soon as Cartesia emits it.
    yield from client.tts.stream(
        model_id="sonic-turbo",
        transcript=text,
        voice={"mode": "id", "id": voice_id},
        output_format=audio_format,
    )

@workflow()
def process_text_batch(texts: list[str], voice_id: str) -> list[bytes]:
    """Synthesize every entry in *texts*, returning audio buffers in order."""
    return [generate_speech(entry, voice_id) for entry in texts]

# Usage
# Synthesize a single sentence; the second argument is a Cartesia voice UUID.
audio_data = generate_speech(
    "Hello, this is Cartesia Sonic speech synthesis.",
    "a0e99841-438c-4a64-b679-ae501e7d6091"
)

Speech-to-Text with Ink Models

Track Cartesia STT operations using Netra decorators:
from cartesia import Cartesia
from netra import task
import os

# Module-level Cartesia client reused by the STT tasks below.
# CARTESIA_API_KEY must be set in the environment.
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))

@task()
def transcribe_audio(audio_path: str) -> str:
    """Transcribe the audio file at *audio_path* with the Ink-Whisper model."""
    with open(audio_path, "rb") as fh:
        payload = fh.read()

    result = client.stt.transcribe(
        model_id="ink-whisper",
        audio=payload,
        language="en",
    )
    return result.transcript

@task()
def transcribe_stream(audio_stream) -> str:
    """Transcribe streaming audio in real time and return the final text."""
    results = client.stt.stream_transcribe(
        model_id="ink-whisper",
        audio=audio_stream,
        language="en",
        interim_results=True,
    )
    # Keep only finalized segments; interim results are superseded later.
    final_segments = [item.transcript for item in results if item.is_final]
    return " ".join(final_segments).strip()

# Usage
# Transcribe a local WAV file into plain text.
transcript = transcribe_audio("./audio/sample.wav")

Manual Span Creation with Action Tracking

For detailed control over tracing with both TTS and STT:
from cartesia import Cartesia
from netra import SpanWrapper, ActionModel, UsageModel
import os
import time

# Module-level client shared by the manually-traced helpers below.
client = Cartesia(api_key=os.environ.get("CARTESIA_API_KEY"))

def generate_speech_with_tracking(text: str, voice_id: str) -> bytes:
    """Generate speech via Sonic Turbo inside a manually managed Netra span.

    Records latency, an ActionModel describing the TTS call, and a
    character-based UsageModel on the span, then returns the raw audio bytes.

    Raises:
        Whatever the Cartesia client raises; the span is marked as an error
        and is always closed.
    """
    span = SpanWrapper("cartesia-tts")
    span.start()

    try:
        start_time = time.time_ns()
        span.set_attribute("text_length", len(text))
        span.set_attribute("voice_id", voice_id)
        span.set_attribute("model", "sonic-turbo")

        response = client.tts.bytes(
            model_id="sonic-turbo",
            transcript=text,
            voice={"mode": "id", "id": voice_id},
            output_format={
                "container": "raw",
                "encoding": "pcm_f32le",
                "sample_rate": 44100
            }
        )

        end_time = time.time_ns()
        duration_ms = (end_time - start_time) / 1_000_000

        # Track the TTS API operation (metadata values are strings by convention).
        action = ActionModel(
            start_time=str(start_time),
            action="API",
            action_type="TTS_SYNTHESIS",
            metadata={
                "provider": "cartesia",
                "model": "sonic-turbo",
                "voice_id": voice_id,
                "text_length": str(len(text)),
                "audio_size_bytes": str(len(response)),
                "sample_rate": "44100",
                "latency_ms": str(duration_ms)
            },
            success=True
        )
        span.set_action([action])

        # Track usage: TTS is billed per input character.
        # NOTE(review): the per-character rate below is illustrative — confirm
        # against current Cartesia pricing.
        usage = UsageModel(
            model="sonic-turbo",
            usage_type="characters",
            units_used=len(text),
            cost_in_usd=len(text) * 0.00001
        )
        span.set_usage([usage])

        span.set_status({"code": 1, "message": "Success"})
        return response

    except Exception as e:
        span.set_error(e)
        span.set_status({"code": 2, "message": "Error"})
        raise
    finally:
        # Guarantee the span is closed even if one of the tracking calls
        # (set_action/set_usage/set_status) itself raises — the original
        # duplicated span.end() on both paths and could leak the span.
        span.end()

def transcribe_with_tracking(audio_path: str) -> str:
    """Transcribe an audio file via Ink-Whisper inside a manually managed span.

    Records the transcription latency, an ActionModel describing the STT
    call, and a duration-based UsageModel, then returns the transcript.

    Raises:
        OSError: if the audio file cannot be read.
        Whatever the Cartesia client raises; the span is marked as an error
        and is always closed.
    """
    span = SpanWrapper("cartesia-stt")
    span.start()

    try:
        start_time = time.time_ns()

        with open(audio_path, "rb") as audio_file:
            audio_data = audio_file.read()

        audio_size_bytes = len(audio_data)
        span.set_attribute("audio_file", audio_path)
        span.set_attribute("audio_size_bytes", audio_size_bytes)
        span.set_attribute("model", "ink-whisper")

        response = client.stt.transcribe(
            model_id="ink-whisper",
            audio=audio_data,
            language="en"
        )

        end_time = time.time_ns()
        duration_ms = (end_time - start_time) / 1_000_000

        # Track the STT API operation (metadata values are strings by convention).
        action = ActionModel(
            start_time=str(start_time),
            action="API",
            action_type="STT_TRANSCRIPTION",
            metadata={
                "provider": "cartesia",
                "model": "ink-whisper",
                "audio_size_bytes": str(audio_size_bytes),
                "transcript_length": str(len(response.transcript)),
                "duration_ms": str(duration_ms),
                "language": "en"
            },
            success=True
        )
        span.set_action([action])

        # Track usage: Ink-Whisper is billed at $0.13 per audio *hour*, so
        # convert to a per-second rate (0.13 / 3600 ≈ $0.0000361/s).
        # The original multiplied by 0.036 — $129.60/hr, a 1000x overstatement
        # contradicting its own "$0.13 per hour" comment.
        audio_duration = getattr(response, 'duration', 0)
        usage = UsageModel(
            model="ink-whisper",
            usage_type="audio_seconds",
            units_used=audio_duration,
            cost_in_usd=audio_duration * (0.13 / 3600)
        )
        span.set_usage([usage])

        span.set_attribute("transcript_length", len(response.transcript))
        span.set_status({"code": 1, "message": "Success"})
        return response.transcript

    except Exception as e:
        span.set_error(e)
        span.set_status({"code": 2, "message": "Error"})
        raise
    finally:
        # Guarantee the span is closed even if a tracking call itself raises.
        span.end()

# Usage
# Run the manually-traced TTS and STT helpers end to end.
audio_data = generate_speech_with_tracking(
    "This is ultra-low latency speech synthesis.",
    "a0e99841-438c-4a64-b679-ae501e7d6091"
)

transcript = transcribe_with_tracking("./audio/sample.wav")

Next Steps

Last modified on January 30, 2026