The Netra SDK exposes an evaluation client that lets you:
Manage datasets - Create datasets and add test items
Run test suites - Execute tasks against datasets with automatic tracing
Apply evaluators - Score outputs using built-in or custom evaluators
This page shows how to use Netra.evaluation to manage datasets, run test suites, and programmatically evaluate your AI applications.
Getting Started
The evaluation client is available on the main Netra entry point after initialization.
from netra import Netra

Netra.init(app_name="sample-app")

# Access the evaluation client
Netra.evaluation.create_dataset(...)
Netra.evaluation.add_dataset_item(...)
Netra.evaluation.get_dataset(...)
Netra.evaluation.run_test_suite(...)
create_dataset
Create an empty dataset that can hold test items for evaluation runs.
from netra import Netra
from netra.evaluation import TurnType

Netra.init(app_name="sample-app")

result = Netra.evaluation.create_dataset(
    name="Customer Support QA",
    tags=["support", "qa", "v1"],
    turn_type=TurnType.SINGLE,  # or TurnType.MULTI for multi-turn
)

print(f"Dataset created: {result.id}")
print(f"Name: {result.name}")
print(f"Tags: {result.tags}")
Parameters
Parameter | Type | Description
name | str | Name of the dataset (required)
tags | list[str]? | Optional tags for categorization
turn_type | TurnType | SINGLE for single-turn or MULTI for multi-turn datasets
Response: CreateDatasetResponse
Field | Type | Description
id | str | Unique dataset identifier
name | str | Dataset name
tags | list[str] | Associated tags
project_id | str | Project identifier
organization_id | str | Organization identifier
created_by | str | Creator identifier
updated_by | str | Last updater identifier
created_at | str | Creation timestamp
updated_at | str | Last update timestamp
deleted_at | str? | Deletion timestamp (if soft-deleted)
TurnType values
Value | Description
TurnType.SINGLE | Single-turn evaluation (one input → one output)
TurnType.MULTI | Multi-turn evaluation (conversation sequences)
add_dataset_item
Add a single test item to an existing dataset.
from netra import Netra
from netra.evaluation import DatasetItem

Netra.init(app_name="sample-app")

result = Netra.evaluation.add_dataset_item(
    dataset_id="dataset-123",
    item=DatasetItem(
        input="What is the return policy for electronics?",
        expected_output="Electronics can be returned within 30 days with original packaging.",
        tags=["policy", "returns"],
        metadata={"category": "electronics", "priority": "high"},
    ),
)

print(f"Item added: {result.id}")
print(f"Input: {result.input}")
Parameters
Parameter | Type | Description
dataset_id | str | ID of the target dataset
item | DatasetItem | The test item to add
DatasetItem
Field | Type | Description
input | Any | The input to pass to your task (required)
expected_output | Any? | Expected output for comparison
tags | list[str]? | Optional tags for the item
metadata | dict? | Optional metadata for evaluators
Response: AddDatasetItemResponse
Field | Type | Description
id | str | Unique item identifier
dataset_id | str | Parent dataset ID
project_id | str | Project identifier
organization_id | str | Organization identifier
source | str | Source of the item
source_id | str? | Source reference ID
input | Any | The input value
expected_output | Any | The expected output
is_active | bool | Whether the item is active
tags | list[str] | Associated tags
metadata | dict? | Item metadata
created_by | str | Creator identifier
updated_by | str | Last updater identifier
created_at | str | Creation timestamp
updated_at | str | Last update timestamp
deleted_at | str? | Deletion timestamp (if soft-deleted)
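Because input and expected_output are typed Any, a multi-turn dataset (TurnType.MULTI) can carry a whole conversation in one item. The message schema for multi-turn items is not documented on this page; the sketch below is only an assumption, using a list of role/content dictionaries and a hypothetical dataset ID.

from netra import Netra
from netra.evaluation import DatasetItem

Netra.init(app_name="sample-app")

# Assumed shape: the conversation stored as a list of role/content messages.
# Adjust to whatever schema your evaluators expect.
Netra.evaluation.add_dataset_item(
    dataset_id="dataset-456",  # hypothetical multi-turn dataset
    item=DatasetItem(
        input=[
            {"role": "user", "content": "I ordered headphones last week."},
            {"role": "assistant", "content": "Thanks, I can help with that order."},
            {"role": "user", "content": "Can I still return them?"},
        ],
        expected_output="Electronics can be returned within 30 days with original packaging.",
        tags=["multi-turn", "returns"],
    ),
)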
get_dataset
Retrieve a dataset and all its items by ID.
from netra import Netra

Netra.init(app_name="sample-app")

dataset = Netra.evaluation.get_dataset(dataset_id="dataset-123")

print(f"Total items: {len(dataset.items)}")

for item in dataset.items:
    print(f"ID: {item.id}")
    print(f"Input: {item.input}")
    print(f"Expected: {item.expected_output}")
    print("---")
Parameters
Parameter | Type | Description
dataset_id | str | ID of the dataset to retrieve
Response: GetDatasetItemsResponse
Field | Type | Description
items | list[DatasetRecord] | List of dataset items
DatasetRecord
Field | Type | Description
id | str | Item identifier
dataset_id | str | Parent dataset ID
input | Any | The input value
expected_output | Any | The expected output
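Since expected_output can be empty, it can help to check how many items actually carry a reference answer before starting a run. A minimal sketch using only the documented DatasetRecord fields; whether run_test_suite accepts a filtered copy of the dataset is not covered on this page, so this only inspects the items.

from netra import Netra

Netra.init(app_name="sample-app")

dataset = Netra.evaluation.get_dataset(dataset_id="dataset-123")

# Count the items that carry a reference answer to score against.
scored = [item for item in dataset.items if item.expected_output is not None]
print(f"{len(scored)} of {len(dataset.items)} items have an expected output")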
run_test_suite
Execute a test suite against a dataset, running your task function on each item and optionally applying evaluators.
from netra import Netra
from openai import OpenAI

Netra.init(app_name="sample-app")

client = OpenAI()

def my_task(input_data):
    """Task function that processes each dataset item."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": input_data},
        ],
    )
    return response.choices[0].message.content

# Get dataset
dataset = Netra.evaluation.get_dataset(dataset_id="dataset-123")

# Run test suite
result = Netra.evaluation.run_test_suite(
    name="GPT-4o Mini Evaluation",
    data=dataset,
    task=my_task,
    evaluators=["correctness", "relevance"],  # Optional evaluator IDs
    max_concurrency=10,
)

print(f"Run ID: {result['runId']}")
print(f"Items processed: {len(result['items'])}")
Parameters
Parameter | Type | Description
name | str | Name for this test run (required)
data | Dataset | Dataset from get_dataset()
task | Callable | Function that takes input and returns output
evaluators | list? | Optional evaluator IDs or configs
max_concurrency | int | Max parallel task executions (default: 50)
Response
Field | Type | Description
runId | str | Unique run identifier
items | list[dict] | Results for each processed item
Item Result
Field | Type | Description
index | int | Item index in dataset
status | str | "completed" or "failed"
traceId | str | Trace ID for observability
spanId | str | Span ID for the task execution
testRunItemId | str | Backend item identifier
The task function receives the input field from each dataset item and should return the output that evaluators will compare against expected_output.
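Because input is typed Any, the value passed to your task has whatever shape the item was stored with (a plain string, a dict, a list of messages). Below is a minimal sketch of a task that normalizes string and dict inputs before calling a model; the "question" key is an assumption for illustration, not part of the SDK.

from openai import OpenAI

client = OpenAI()

def my_task(input_data):
    """Normalize the dataset input before calling the model."""
    if isinstance(input_data, str):
        query = input_data
    elif isinstance(input_data, dict):
        # Hypothetical key; use whatever structure your dataset items carry.
        query = input_data.get("question", str(input_data))
    else:
        query = str(input_data)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}],
    )
    # Return the text that evaluators compare against expected_output.
    return response.choices[0].message.content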
When to Use Which API
Dataset Management (create_dataset / add_dataset_item / get_dataset): Build and manage test datasets programmatically. Use for CI/CD pipelines or when generating test cases from production data.
Test Execution (run_test_suite): Execute your AI task against a dataset with automatic tracing and evaluation. Use for regression testing and model comparisons.
Advanced Workflows (create_run): Create runs without immediate execution. Use when you need custom orchestration or want to manage the run lifecycle separately.
Evaluators (evaluator IDs or configs): Pass evaluator IDs to run_test_suite to automatically score outputs. Configure custom evaluators in the Netra dashboard.
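For CI/CD use, the run_test_suite result already contains enough to gate a pipeline: each item reports a status of "completed" or "failed". The following is a minimal sketch using only the documented result fields; the app name, dataset ID, and placeholder task are illustrative assumptions.

import sys
from netra import Netra

Netra.init(app_name="ci-evaluation")

def my_task(input_data):
    # Placeholder task; substitute your real model call here.
    return str(input_data)

dataset = Netra.evaluation.get_dataset(dataset_id="dataset-123")

result = Netra.evaluation.run_test_suite(
    name="CI regression run",
    data=dataset,
    task=my_task,
    evaluators=["correctness"],
)

failed = [item for item in result["items"] if item["status"] == "failed"]
for item in failed:
    print(f"Item {item['index']} failed (trace: {item['traceId']})")

# A non-zero exit code fails the pipeline if any task execution did not complete.
sys.exit(1 if failed else 0)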
Complete Example
from netra import Netra
from netra.evaluation import DatasetItem, TurnType
from openai import OpenAI

# Initialize
Netra.init(
    app_name="evaluation-demo",
    headers="x-api-key=your-api-key",
)

client = OpenAI()

# 1. Create a dataset
dataset_response = Netra.evaluation.create_dataset(
    name="Product FAQ Evaluation",
    tags=["faq", "products", "v2"],
    turn_type=TurnType.SINGLE,
)
dataset_id = dataset_response.id
print(f"Created dataset: {dataset_id}")

# 2. Add test items
test_cases = [
    {
        "input": "What is your return policy?",
        "expected_output": "Items can be returned within 30 days.",
    },
    {
        "input": "How long does shipping take?",
        "expected_output": "Standard shipping takes 3-5 business days.",
    },
    {
        "input": "Do you offer international shipping?",
        "expected_output": "Yes, we ship to over 50 countries.",
    },
]

for case in test_cases:
    Netra.evaluation.add_dataset_item(
        dataset_id=dataset_id,
        item=DatasetItem(
            input=case["input"],
            expected_output=case["expected_output"],
        ),
    )

print(f"Added {len(test_cases)} test items")

# 3. Define the task
def faq_agent(query: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a customer support agent. Answer concisely."},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content

# 4. Run the test suite
dataset = Netra.evaluation.get_dataset(dataset_id=dataset_id)

result = Netra.evaluation.run_test_suite(
    name="FAQ Agent v2 Evaluation",
    data=dataset,
    task=faq_agent,
    evaluators=["correctness", "relevance"],
    max_concurrency=5,
)

# 5. Review results
print(f"\nRun completed: {result['runId']}")
for item in result["items"]:
    print(f"  Item {item['index']}: {item['status']} (trace: {item['traceId']})")

print("\nView detailed results in Netra dashboard → Evaluation → Test Runs")
Next Steps