from typing import Optional

import weave
from weave import EvaluationLogger
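# Note: this sketch assumes weave.init("<your-project>") has already been called
# earlier in the script (e.g., where the InferenceModel instances were created),
# so the EvaluationLogger has a W&B project to log into.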
# Create a simple dataset
dataset = [
    {"question": "What is 2 + 2?", "expected": "4"},
    {"question": "What is the capital of France?", "expected": "Paris"},
    {"question": "Name a primary color", "expected_one_of": ["red", "blue", "yellow"]},
]
# Define a scorer
@weave.op()
def accuracy_scorer(expected: str, output: str, expected_one_of: Optional[list[str]] = None) -> dict:
"""Score the accuracy of the model output."""
output_clean = output.strip().lower()
if expected_one_of:
is_correct = any(option.lower() in output_clean for option in expected_one_of)
else:
is_correct = expected.lower() in output_clean
return {"correct": is_correct, "score": 1.0 if is_correct else 0.0}
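
# For example (illustrative): accuracy_scorer(expected="4", output="The answer is 4.")
# returns {"correct": True, "score": 1.0}, since "4" appears in the normalized output.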
# Evaluate a model using Weave's EvaluationLogger
def evaluate_model(model: InferenceModel, dataset: list[dict]):
"""Run evaluation on a dataset using Weave's built-in evaluation framework."""
# Initialize EvaluationLogger BEFORE calling the model to capture token usage
# This is especially important for W&B Inference to track costs
# Convert model name to a valid format (replace non-alphanumeric chars with underscores)
safe_model_name = model.model_name.replace("/", "_").replace("-", "_").replace(".", "_")
eval_logger = EvaluationLogger(
model=safe_model_name,
dataset="qa_dataset"
)
    for example in dataset:
        # Get model prediction
        output = model.predict(example["question"])

        # Log the prediction
        pred_logger = eval_logger.log_prediction(
            inputs={"question": example["question"]},
            output=output,
        )

        # Score the output
        score = accuracy_scorer(
            expected=example.get("expected", ""),
            output=output,
            expected_one_of=example.get("expected_one_of"),
        )

        # Log the score
        pred_logger.log_score(
            scorer="accuracy",
            score=score["score"],
        )

        # Finish logging for this prediction
        pred_logger.finish()
    # Log summary - Weave automatically aggregates the accuracy scores
    eval_logger.log_summary()
    print(f"Evaluation complete for {model.model_name} (logged as: {safe_model_name}). View results in the Weave UI.")
# Compare multiple models - a key feature of Weave's evaluation framework
models_to_compare = [
    llama_model,
    deepseek_model,
]
for model in models_to_compare:
    evaluate_model(model, dataset)
# In the Weave UI, navigate to the Evals tab to compare results across models