The Netra SDK exposes an evaluation client that lets you:
  • Manage datasets - Create datasets and add test items
  • Run test suites - Execute tasks against datasets with automatic tracing
  • Apply evaluators - Score outputs using built-in or custom evaluators
This page shows how to use Netra.evaluation to manage datasets, run test suites, and programmatically evaluate your AI applications.

Getting Started

The evaluation client is available on the main Netra entry point after initialization.
from netra import Netra

Netra.init(app_name="sample-app")

# Access the evaluation client
Netra.evaluation.create_dataset(...)
Netra.evaluation.add_dataset_item(...)
Netra.evaluation.get_dataset(...)
Netra.evaluation.run_test_suite(...)

create_dataset

Create an empty dataset that can hold test items for evaluation runs.
from netra import Netra
from netra.evaluation import TurnType

Netra.init(app_name="sample-app")

result = Netra.evaluation.create_dataset(
    name="Customer Support QA",
    tags=["support", "qa", "v1"],
    turn_type=TurnType.SINGLE,  # or TurnType.MULTI for multi-turn
)

print(f"Dataset created: {result.id}")
print(f"Name: {result.name}")
print(f"Tags: {result.tags}")

Parameters

Parameter | Type | Description
name | str | Name of the dataset (required)
tags | list[str]? | Optional tags for categorization
turn_type | TurnType | SINGLE for single-turn or MULTI for multi-turn datasets

Response: CreateDatasetResponse

Field | Type | Description
id | str | Unique dataset identifier
name | str | Dataset name
tags | list[str] | Associated tags
project_id | str | Project identifier
organization_id | str | Organization identifier
created_by | str | Creator identifier
updated_by | str | Last updater identifier
created_at | str | Creation timestamp
updated_at | str | Last update timestamp
deleted_at | str? | Deletion timestamp (if soft-deleted)

TurnType values

Value | Description
TurnType.SINGLE | Single-turn evaluation (one input → one output)
TurnType.MULTI | Multi-turn evaluation (conversation sequences)
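For TurnType.MULTI datasets, an item's input typically carries a whole conversation rather than a single string. The exact multi-turn item format is not documented on this page, so the sketch below assumes a list of role/content messages; adjust it to whatever conversation shape your task consumes.
from netra import Netra
from netra.evaluation import DatasetItem, TurnType

Netra.init(app_name="sample-app")

# Create a multi-turn dataset using the documented create_dataset call
multi = Netra.evaluation.create_dataset(
    name="Support Conversations",
    tags=["support", "multi-turn"],
    turn_type=TurnType.MULTI,
)

# Assumption: a multi-turn input is a list of role/content messages.
# DatasetItem.input is typed Any, so any conversation structure your
# task understands should work here.
Netra.evaluation.add_dataset_item(
    dataset_id=multi.id,
    item=DatasetItem(
        input=[
            {"role": "user", "content": "My order arrived damaged."},
            {"role": "assistant", "content": "Sorry to hear that. Could you share the order number?"},
            {"role": "user", "content": "It's 4821. Can I get a replacement?"},
        ],
        expected_output="Offer a replacement or refund after confirming the order number.",
    ),
)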

add_dataset_item

Add a single test item to an existing dataset.
from netra import Netra
from netra.evaluation import DatasetItem

Netra.init(app_name="sample-app")

result = Netra.evaluation.add_dataset_item(
    dataset_id="dataset-123",
    item=DatasetItem(
        input="What is the return policy for electronics?",
        expected_output="Electronics can be returned within 30 days with original packaging.",
        tags=["policy", "returns"],
        metadata={"category": "electronics", "priority": "high"},
    ),
)

print(f"Item added: {result.id}")
print(f"Input: {result.input}")

Parameters

Parameter | Type | Description
dataset_id | str | ID of the target dataset
item | DatasetItem | The test item to add

DatasetItem

Field | Type | Description
input | Any | The input to pass to your task (required)
expected_output | Any? | Expected output for comparison
tags | list[str]? | Optional tags for the item
metadata | dict? | Optional metadata for evaluators

Response: AddDatasetItemResponse

Field | Type | Description
id | str | Unique item identifier
dataset_id | str | Parent dataset ID
project_id | str | Project identifier
organization_id | str | Organization identifier
source | str | Source of the item
source_id | str? | Source reference ID
input | Any | The input value
expected_output | Any | The expected output
is_active | bool | Whether the item is active
tags | list[str] | Associated tags
metadata | dict? | Item metadata
created_by | str | Creator identifier
updated_by | str | Last updater identifier
created_at | str | Creation timestamp
updated_at | str | Last update timestamp
deleted_at | str? | Deletion timestamp (if soft-deleted)

get_dataset

Retrieve a dataset and all its items by ID.
from netra import Netra

Netra.init(app_name="sample-app")

dataset = Netra.evaluation.get_dataset(dataset_id="dataset-123")

print(f"Total items: {len(dataset.items)}")

for item in dataset.items:
    print(f"ID: {item.id}")
    print(f"Input: {item.input}")
    print(f"Expected: {item.expected_output}")
    print("---")

Parameters

Parameter | Type | Description
dataset_id | str | ID of the dataset to retrieve

Response: GetDatasetItemsResponse

Field | Type | Description
items | list[DatasetRecord] | List of dataset items

DatasetRecord

Field | Type | Description
id | str | Item identifier
dataset_id | str | Parent dataset ID
input | Any | The input value
expected_output | Any | The expected output
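Because each DatasetRecord exposes id, input, and expected_output, a retrieved dataset can be snapshotted for offline review or version control. A minimal sketch, assuming inputs and expected outputs are JSON-serializable (the file name is arbitrary):
import json

from netra import Netra

Netra.init(app_name="sample-app")

dataset = Netra.evaluation.get_dataset(dataset_id="dataset-123")

# Write the documented DatasetRecord fields to a local JSONL file.
# Assumes item.input and item.expected_output are JSON-serializable.
with open("dataset-123.jsonl", "w") as f:
    for item in dataset.items:
        record = {
            "id": item.id,
            "input": item.input,
            "expected_output": item.expected_output,
        }
        f.write(json.dumps(record) + "\n")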

run_test_suite

Execute a test suite against a dataset, running your task function on each item and optionally applying evaluators.
from netra import Netra
from openai import OpenAI

Netra.init(app_name="sample-app")

client = OpenAI()

def my_task(input_data):
    """Task function that processes each dataset item."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input_data},
        ],
    )
    return response.choices[0].message.content

# Get dataset
dataset = Netra.evaluation.get_dataset(dataset_id="dataset-123")

# Run test suite
result = Netra.evaluation.run_test_suite(
    name="GPT-4o Mini Evaluation",
    data=dataset,
    task=my_task,
    evaluators=["correctness", "relevance"],  # Optional evaluator IDs
    max_concurrency=10,
)

print(f"Run ID: {result['runId']}")
print(f"Items processed: {len(result['items'])}")

Parameters

Parameter | Type | Description
name | str | Name for this test run (required)
data | Dataset | Dataset from get_dataset()
task | Callable | Function that takes input and returns output
evaluators | list? | Optional evaluator IDs or configs
max_concurrency | int | Max parallel task executions (default: 50)

Response

Field | Type | Description
runId | str | Unique run identifier
items | list[dict] | Results for each processed item

Item Result

Field | Type | Description
index | int | Item index in dataset
status | str | "completed" or "failed"
traceId | str | Trace ID for observability
spanId | str | Span ID for the task execution
testRunItemId | str | Backend item identifier

The task function receives the input field from each dataset item and returns the output that evaluators compare against expected_output.
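A task may also need to branch on the shape of the input it receives. The sketch below handles the documented single-turn case (a plain string) and, as an assumption, a message-list input like the multi-turn sketch earlier; only the string branch is shown elsewhere on this page.
from openai import OpenAI

client = OpenAI()

def support_task(input_data):
    """Return the model's answer for a single-turn or multi-turn item."""
    if isinstance(input_data, str):
        # Documented single-turn case: the input is the user query itself
        messages = [{"role": "user", "content": input_data}]
    else:
        # Assumed multi-turn case: the input is already a list of
        # role/content messages (see the multi-turn sketch above)
        messages = list(input_data)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system", "content": "You are a support agent."}] + messages,
    )
    return response.choices[0].message.content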

When to Use Which API

Dataset Management

create_dataset / add_dataset_item / get_dataset: Build and manage test datasets programmatically. Use them in CI/CD pipelines or when generating test cases from production data, as in the sketch below.
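For example, a CI job can seed a regression dataset from reviewed production samples using only the calls documented above. In the sketch below, load_production_samples is a hypothetical stand-in for however you export those samples:
from netra import Netra
from netra.evaluation import DatasetItem, TurnType

Netra.init(app_name="sample-app")

def load_production_samples():
    # Hypothetical helper: replace with your log export, warehouse
    # query, or annotation tool of choice.
    return [
        {"query": "Can I change my delivery address?", "approved_answer": "Yes, any time before the order ships."},
    ]

dataset = Netra.evaluation.create_dataset(
    name="Production Regression Set",
    tags=["production", "regression"],
    turn_type=TurnType.SINGLE,
)

for sample in load_production_samples():
    Netra.evaluation.add_dataset_item(
        dataset_id=dataset.id,
        item=DatasetItem(
            input=sample["query"],
            expected_output=sample["approved_answer"],
        ),
    )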

Test Execution

run_test_suite: Execute your AI task against a dataset with automatic tracing and evaluation. Use it for regression testing and model comparisons; a CI-oriented sketch follows.
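A simple CI gate can be built from the documented run_test_suite response alone: inspect each item's status and fail the build if anything did not complete. A minimal sketch (the placeholder task and dataset ID are illustrative):
import sys

from netra import Netra

Netra.init(app_name="sample-app")

def my_task(input_data):
    # Placeholder task; swap in your real application call,
    # such as the faq_agent in the complete example below.
    return f"echo: {input_data}"

dataset = Netra.evaluation.get_dataset(dataset_id="dataset-123")

result = Netra.evaluation.run_test_suite(
    name="CI Regression Run",
    data=dataset,
    task=my_task,
    evaluators=["correctness"],
    max_concurrency=5,
)

# Fail the CI job if any item did not finish with status "completed".
failed = [item for item in result["items"] if item["status"] != "completed"]
if failed:
    print(f"{len(failed)} item(s) failed in run {result['runId']}")
    sys.exit(1)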

Advanced Workflows

create_run: Create runs without immediate execution. Use it when you need custom orchestration or want to manage the run lifecycle separately.

Evaluators

Evaluator IDs or Configs: Pass evaluator IDs to run_test_suite to automatically score outputs. Configure custom evaluators in the Netra dashboard.

Complete Example

from netra import Netra
from netra.evaluation import DatasetItem, TurnType
from openai import OpenAI

# Initialize
Netra.init(
    app_name="evaluation-demo",
    headers="x-api-key=your-api-key",
)
client = OpenAI()

# 1. Create a dataset
dataset_response = Netra.evaluation.create_dataset(
    name="Product FAQ Evaluation",
    tags=["faq", "products", "v2"],
    turn_type=TurnType.SINGLE,
)
dataset_id = dataset_response.id
print(f"Created dataset: {dataset_id}")

# 2. Add test items
test_cases = [
    {
        "input": "What is your return policy?",
        "expected_output": "Items can be returned within 30 days.",
    },
    {
        "input": "How long does shipping take?",
        "expected_output": "Standard shipping takes 3-5 business days.",
    },
    {
        "input": "Do you offer international shipping?",
        "expected_output": "Yes, we ship to over 50 countries.",
    },
]

for case in test_cases:
    Netra.evaluation.add_dataset_item(
        dataset_id=dataset_id,
        item=DatasetItem(
            input=case["input"],
            expected_output=case["expected_output"],
        ),
    )
print(f"Added {len(test_cases)} test items")

# 3. Define the task
def faq_agent(query: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a customer support agent. Answer concisely."},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content

# 4. Run the test suite
dataset = Netra.evaluation.get_dataset(dataset_id=dataset_id)

result = Netra.evaluation.run_test_suite(
    name="FAQ Agent v2 Evaluation",
    data=dataset,
    task=faq_agent,
    evaluators=["correctness", "relevance"],
    max_concurrency=5,
)

# 5. Review results
print(f"\nRun completed: {result['runId']}")
for item in result["items"]:
    print(f"  Item {item['index']}: {item['status']} (trace: {item['traceId']})")

print("\nView detailed results in Netra dashboard → Evaluation → Test Runs")
