Python API
Basic Usage
import asyncio
from truthfulness_evaluator import create_truthfulness_graph
from truthfulness_evaluator.core.config import EvaluatorConfig
async def evaluate():
    """Run a full truthfulness evaluation on one document and return its report.

    Builds an evaluator config, creates the default graph, and invokes it
    with a fresh initial state. The graph fills in `claims`, `verifications`,
    and `final_report` as it runs.
    """
    # Configure which models verify claims and whether web search is allowed.
    config = EvaluatorConfig(
        verification_models=["gpt-4o"],
        enable_web_search=True
    )

    # Create graph
    graph = create_truthfulness_graph()

    # Run evaluation. The dict below is the complete initial state; empty
    # lists/dicts are placeholders the graph populates during execution.
    result = await graph.ainvoke({
        "document": "Python was created in 1991.",
        "document_path": "test.md",
        "root_path": None,
        "claims": [],
        "current_claim_index": 0,
        "verifications": [],
        "evidence_cache": {},
        "config": config.model_dump(),
        "final_report": None
    })
    return result["final_report"]

report = asyncio.run(evaluate())
Components
Claim Extraction
from truthfulness_evaluator.llm.chains.extraction import SimpleClaimExtractionChain
extractor = SimpleClaimExtractionChain(model="gpt-4o-mini")
claims = await extractor.extract(
document="Python was created in 1991.",
source_path="test.md"
)
for claim in claims:
print(f"{claim.id}: {claim.text}")
Verification
from truthfulness_evaluator.llm.chains.verification import VerificationChain
from truthfulness_evaluator.models import Claim, Evidence
verifier = VerificationChain(model_name="gpt-4o")
claim = Claim(id="c1", text="Python was created in 1991", source_document="test.md")
evidence = [Evidence(
source="python.org",
source_type="web",
content="Python was created by Guido van Rossum...",
relevance_score=0.9
)]
result = await verifier.verify(claim, evidence)
print(f"{result.verdict} ({result.confidence:.0%})")
Consensus
from truthfulness_evaluator.llm.chains.consensus import ConsensusChain
consensus = ConsensusChain(
model_names=["gpt-4o", "gpt-4o-mini"],
confidence_threshold=0.7
)
result = await consensus.verify(claim, evidence)
print(result.model_votes) # {'gpt-4o': 'SUPPORTS', 'gpt-4o-mini': 'SUPPORTS'}
Evidence Processing
from truthfulness_evaluator.llm.chains.evidence import EvidenceProcessor
processor = EvidenceProcessor(model="gpt-4o-mini")
evidence, summary = await processor.analyze_evidence(claim, evidence_list)
print(summary)
for e in evidence:
print(f"{e.source}: relevance={e.relevance_score:.0%}")
Streaming
async for event in graph.astream(
input_state,
config={"configurable": {"thread_id": "eval_1"}},
stream_mode="updates"
):
if "extract_claims" in event:
print(f"Extracted {len(event['extract_claims']['claims'])} claims")
elif "verify_claim" in event:
print("Verified claim")
Checkpointing
Resume an interrupted evaluation by invoking the graph again with the same thread_id (requires the graph to be compiled with a checkpointer):
config = {"configurable": {"thread_id": "eval_1"}}
# Start
result = await graph.ainvoke(input_state, config)
# Resume after interruption
result = await graph.ainvoke(None, config)
Human-in-the-Loop
from langgraph.types import Command
# Interrupt for human review
human_input = interrupt({
"claim": claim.text,
"proposed_verdict": verification.verdict,
"question": "Approve?"
})
# Resume with human input
result = await graph.ainvoke(
Command(resume={"response": "approve"}),
config
)
Custom Workflows
Compose the provided nodes into your own LangGraph workflow:
from langgraph.graph import StateGraph, START, END

# Assemble a minimal extract -> verify pipeline over the shared state schema.
workflow = StateGraph(TruthfulnessState)

# Register each node, then wire it into the flow.
workflow.add_node("extract", extract_claims_node)
workflow.add_edge(START, "extract")

workflow.add_node("verify", verify_claim_node)
workflow.add_edge("extract", "verify")
workflow.add_edge("verify", END)

graph = workflow.compile()