import asyncio
import weave
from verdict import Pipeline
from verdict.common.judge import JudgeUnit
from verdict.schema import Schema

# Initialize Weave
# highlight-next-line
weave.init("verdict_demo")

# Create evaluation model
class SentimentEvaluator(weave.Model):
    @weave.op()
    async def predict(self, text: str) -> dict:
        # Build a single-judge Verdict pipeline that classifies sentiment
        pipeline = Pipeline()
        pipeline = pipeline >> JudgeUnit().prompt(
            "Classify sentiment as positive, negative, or neutral: {source.text}"
        )

        # Wrap the input text in a Verdict schema and run the pipeline
        data = Schema.of(text=text)
        result = pipeline.run(data)
        return {"sentiment": result}

# Test data
texts = [
    "I love this product, it's amazing!",
    "This is terrible, worst purchase ever.",
    "The weather is okay today."
]
labels = ["positive", "negative", "neutral"]
examples = [
    {"id": str(i), "text": texts[i], "target": labels[i]}
    for i in range(len(texts))
]

# Scoring function
@weave.op()
def sentiment_accuracy(target: str, output: dict) -> dict:
    # Treat the prediction as correct if the target label appears in the judge output;
    # str() keeps the substring check safe whatever shape the pipeline result takes
    predicted = str(output.get("sentiment", "")).lower()
    return {"correct": target.lower() in predicted}

model = SentimentEvaluator()
evaluation = weave.Evaluation(
    dataset=examples,
    scorers=[sentiment_accuracy],
)

scores = asyncio.run(evaluation.evaluate(model))
# If you're in a Jupyter notebook, run:
# scores = await evaluation.evaluate(model)
print(scores)
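
# Optional sanity check (a sketch, not part of the example above): because
# `predict` is decorated with @weave.op(), a direct call to the model is also
# traced in Weave. The input text here is made up for illustration.
# single = asyncio.run(model.predict("Great service and fast shipping!"))
# print(single)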