Chains
Legacy API
The chains module is being replaced by the pluggable adapter architecture. New code should use the adapter classes in the extractors/, gatherers/, verifiers/, and formatters/ packages instead. See the Adapters API Reference for the modern interface.
Extraction
SimpleClaimExtractionChain
from truthfulness_evaluator.llm.chains.extraction import SimpleClaimExtractionChain

extractor = SimpleClaimExtractionChain(model="gpt-4o-mini")
claims = await extractor.extract(
    document="Python was created in 1991...",
    source_path="README.md",
    max_claims=10
)
Output: list[Claim]
Uses structured output (ClaimExtractionOutput):
class ClaimExtractionOutput(BaseModel):
    claims: list[ExtractedClaim]

class ExtractedClaim(BaseModel):
    text: str
    claim_type: str  # "explicit", "implicit", "inferred"
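The returned Claim objects carry the generated id, the cleaned text, the source document path, and the validated claim_type, so callers can iterate them directly. A minimal sketch, assuming the extractor call above succeeded:

for claim in claims:
    print(claim.id, claim.claim_type, claim.text)
    # e.g. claim_000 explicit Python was created in 1991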
TripletExtractionChain
from truthfulness_evaluator.llm.chains.extraction import TripletExtractionChain
extractor = TripletExtractionChain(model="gpt-4o-mini")
claims = await extractor.extract(document, source_path)
Extracts subject-relation-object triplets.
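Each triplet is flattened into a single claim sentence before verification, mirroring the f-string in the source listing in the API Reference below:

subject, relation, obj = "Python", "was created in", "1991"
claim_text = f"{subject} {relation} {obj}".strip()
# -> "Python was created in 1991"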
Verification
VerificationChain
from truthfulness_evaluator.llm.chains.verification import VerificationChain
verifier = VerificationChain(model_name="gpt-4o")
result = await verifier.verify(claim, evidence)
Output: VerificationResult
Uses structured output (VerificationOutput):
class VerificationOutput(BaseModel):
    verdict: str  # "SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO"
    confidence: float  # 0.0 to 1.0
    reasoning: str
    key_evidence: str | None
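The returned VerificationResult mirrors this structure and adds a per-model vote map. A brief sketch of reading it (field names taken from the source listing further down):

print(result.verdict)      # "SUPPORTS", "REFUTES", or "NOT_ENOUGH_INFO"
print(result.confidence)   # clamped to 0.0-1.0; capped at 0.3 when no evidence was supplied
print(result.model_votes)  # e.g. {"gpt-4o": "SUPPORTS"}
print(result.explanation)  # reasoning plus the key evidence, when present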
Consensus
ConsensusChain
from truthfulness_evaluator.llm.chains.consensus import ConsensusChain
consensus = ConsensusChain(
    model_names=["gpt-4o", "gpt-4o-mini"],
    weights={"gpt-4o": 0.6, "gpt-4o-mini": 0.4},
    confidence_threshold=0.7
)
result = await consensus.verify(claim, evidence)
Weighted voting: each model votes independently, the configured weights are applied, and the verdict with the highest weighted total wins. If the models' average confidence falls below confidence_threshold, the verdict is downgraded to NOT_ENOUGH_INFO.
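A condensed sketch of the weighted tally, following the Counter-based logic in the source listing in the API Reference below:

from collections import Counter

votes = {"gpt-4o": "SUPPORTS", "gpt-4o-mini": "REFUTES"}
weights = {"gpt-4o": 0.6, "gpt-4o-mini": 0.4}

tally = Counter()
for model, verdict in votes.items():
    tally[verdict] += weights[model]

final_verdict = tally.most_common(1)[0][0]  # "SUPPORTS" (0.6 beats 0.4)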
ICEConsensusChain
from truthfulness_evaluator.llm.chains.consensus import ICEConsensusChain
ice = ICEConsensusChain(
    model_names=["gpt-4o", "gpt-4o-mini"],
    max_rounds=3
)
result = await ice.verify(claim, evidence)
Iterative Consensus Ensemble: the models critique and revise each other's verdicts over multiple rounds, stopping early once they all agree or when max_rounds is reached.
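In outline (a simplified sketch of the loop shown in full in the API Reference below; the helper names here are illustrative, not methods exposed by ICEConsensusChain):

votes = await initial_votes(claim, evidence)              # round 1: independent verdicts
for round_num in range(2, max_rounds + 1):
    if len(set(votes.values())) == 1:                     # consensus reached, stop early
        break
    critiques = await gather_critiques(claim, evidence, votes, round_num)
    votes = await revise_votes(claim, evidence, votes, critiques, round_num)
final_verdict = majority_vote(votes)                      # NOT_ENOUGH_INFO if avg confidence is low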
Evidence Processing
EvidenceProcessor
from truthfulness_evaluator.llm.chains.evidence import EvidenceProcessor
processor = EvidenceProcessor(model="gpt-4o-mini")
# Analyze evidence relevance
evidence, summary = await processor.analyze_evidence(claim, evidence_list)
# Synthesize multiple pieces
synthesis = await processor.synthesize_evidence(claim, evidence_list)
Output: the evidence list annotated with relevance, support, and credibility scores, plus a summary string.
Uses structured output (EvidenceAnalysisOutput):
class EvidenceAnalysisOutput(BaseModel):
    evidence_analysis: list[EvidenceAnalysisItem]
    summary: str

class EvidenceAnalysisItem(BaseModel):
    index: int
    relevance: float
    supports: bool | None
    credibility: float
    reasoning: str
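analyze_evidence writes these scores back onto the Evidence objects and sorts them by relevance, so the annotations can be read directly afterwards. A minimal sketch, reusing the processor call from above:

evidence, summary = await processor.analyze_evidence(claim, evidence_list)
for e in evidence:  # sorted by relevance, highest first
    print(e.source, f"{e.relevance_score:.0%}", e.supports_claim, e.credibility_score)
print(summary)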
Custom Chains
Build your own:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel

# Define output structure
class MyOutput(BaseModel):
    result: str
    confidence: float

# Create chain
prompt = ChatPromptTemplate.from_template("Verify: {claim}")
llm = ChatOpenAI(model="gpt-4o").with_structured_output(MyOutput)
chain = prompt | llm

# Use
result = await chain.ainvoke({"claim": "Python was created in 1991"})
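To reuse such a chain alongside the built-in ones, one option is to wrap it in a small class with the same lazy-initialization pattern the legacy chains use. A sketch under that assumption (MyVerifier and its verify signature are illustrative, not part of the package; it reuses the prompt and MyOutput defined above):

class MyVerifier:
    """Illustrative wrapper around the custom chain above."""

    def __init__(self, model: str = "gpt-4o"):
        self.model = model
        self._chain = None

    @property
    def chain(self):
        # Lazy initialization, mirroring the built-in chains
        if self._chain is None:
            llm = ChatOpenAI(model=self.model).with_structured_output(MyOutput)
            self._chain = prompt | llm
        return self._chain

    async def verify(self, claim_text: str) -> MyOutput:
        return await self.chain.ainvoke({"claim": claim_text})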
API Reference
Extraction
truthfulness_evaluator.llm.chains.extraction.SimpleClaimExtractionChain
Fallback claim extraction using structured LLM outputs.
Source code in src/truthfulness_evaluator/llm/chains/extraction.py
class SimpleClaimExtractionChain:
    """Fallback claim extraction using structured LLM outputs."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.model = model
        self._llm = None

    @property
    def llm(self):
        """Lazy initialization of LLM with structured output."""
        if self._llm is None:
            base_llm = create_chat_model(self.model, temperature=0)
            # Use structured output
            self._llm = base_llm.with_structured_output(ClaimExtractionOutput)
        return self._llm

    async def extract(
        self, document: str, source_path: str, max_claims: Optional[int] = None
    ) -> list[Claim]:
        """Extract claims using structured LLM output."""
        chain = CLAIM_EXTRACTION_PROMPT | self.llm
        try:
            result: ClaimExtractionOutput = await chain.ainvoke({"text": document})
            claims = []
            for i, extracted in enumerate(result.claims):
                if max_claims and i >= max_claims:
                    break
                if extracted.text.strip():
                    claims.append(
                        Claim(
                            id=f"claim_{i:03d}",
                            text=extracted.text.strip(),
                            source_document=source_path,
                            claim_type=(
                                extracted.claim_type
                                if extracted.claim_type in ["explicit", "implicit", "inferred"]
                                else "explicit"
                            ),
                        )
                    )
            return claims
        except Exception as e:
            logger.warning(f"Simple extraction failed: {e}")
            return []
llm
property
Lazy initialization of LLM with structured output.
extract(document, source_path, max_claims=None)
async
Extract claims using structured LLM output.
truthfulness_evaluator.llm.chains.extraction.TripletExtractionChain
Extract claims as subject-relation-object triplets using structured output.
Source code in src/truthfulness_evaluator/llm/chains/extraction.py
class TripletExtractionChain:
    """Extract claims as subject-relation-object triplets using structured output."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.model = model
        self._llm = None

    @property
    def llm(self):
        """Lazy initialization of LLM with structured output."""
        if self._llm is None:
            base_llm = create_chat_model(self.model, temperature=0)
            self._llm = base_llm.with_structured_output(TripletExtractionOutput)
        return self._llm

    async def extract(
        self, document: str, source_path: str, max_claims: Optional[int] = None
    ) -> list[Claim]:
        """Extract claims as triplets using structured output."""
        chain = TRIPLET_EXTRACTION_PROMPT | self.llm
        try:
            result: TripletExtractionOutput = await chain.ainvoke({"text": document})
            claims = []
            for i, triplet in enumerate(result.triplets):
                if max_claims and i >= max_claims:
                    break
                claim_text = f"{triplet.subject} {triplet.relation} {triplet.object}".strip()
                if claim_text:
                    claims.append(
                        Claim(
                            id=f"claim_{i:03d}",
                            text=claim_text,
                            source_document=source_path,
                            context=triplet.context,
                            claim_type="explicit",
                        )
                    )
            return claims
        except Exception as e:
            logger.warning(f"Triplet extraction failed: {e}")
            return []
llm
property
Lazy initialization of LLM with structured output.
extract(document, source_path, max_claims=None)
async
Extract claims as triplets using structured output.
Verification
truthfulness_evaluator.llm.chains.verification.VerificationChain
Single-model verification chain with structured outputs.
Source code in src/truthfulness_evaluator/llm/chains/verification.py
class VerificationChain:
    """Single-model verification chain with structured outputs."""

    def __init__(self, model_name: str = "gpt-4o"):
        self.model_name = model_name
        self._llm = None

    @property
    def llm(self):
        """Lazy initialization of LLM with structured output."""
        if self._llm is None:
            base_llm = create_chat_model(self.model_name, temperature=0)
            # Use structured output
            self._llm = base_llm.with_structured_output(VerificationOutput)
        return self._llm

    async def verify(self, claim: Claim, evidence: list[Evidence]) -> VerificationResult:
        """Verify a claim against evidence using structured output."""
        # Build evidence text with relevance indicators
        if evidence:
            evidence_parts = []
            for i, e in enumerate(evidence[:4], 1):  # Top 4 evidence items
                support_indicator = ""
                if e.supports_claim is True:
                    support_indicator = "[SUPPORTS]"
                elif e.supports_claim is False:
                    support_indicator = "[REFUTES]"
                else:
                    support_indicator = "[NEUTRAL]"
                evidence_parts.append(
                    f"\n--- Evidence {i} ({e.source_type}) {support_indicator} ---\n"
                    f"Source: {e.source}\n"
                    f"Relevance: {e.relevance_score:.0%}\n"
                    f"Content: {e.content[:600]}"
                )
            evidence_text = "\n".join(evidence_parts)
        else:
            evidence_text = "No evidence provided."
        chain = VERIFICATION_PROMPT | self.llm
        try:
            result: VerificationOutput = await chain.ainvoke(
                {"claim": claim.text, "evidence": evidence_text}
            )
            # Normalize verdict
            verdict = result.verdict.upper()
            if verdict not in ["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO"]:
                verdict = "NOT_ENOUGH_INFO"
            # Combine reasoning with key evidence
            full_explanation = result.reasoning
            if result.key_evidence:
                full_explanation += f"\n\nKey evidence: {result.key_evidence}"
            # Adjust confidence based on evidence quality
            confidence = max(0.0, min(1.0, result.confidence))
            if not evidence:
                confidence = min(confidence, 0.3)
            return VerificationResult(
                claim_id=claim.id,
                verdict=verdict,
                confidence=confidence,
                evidence=evidence,
                explanation=full_explanation,
                model_votes={self.model_name: verdict},
            )
        except Exception as e:
            # Fallback result on error
            return VerificationResult(
                claim_id=claim.id,
                verdict="NOT_ENOUGH_INFO",
                confidence=0.0,
                evidence=evidence,
                explanation=f"Verification failed: {str(e)}",
                model_votes={self.model_name: "NOT_ENOUGH_INFO"},
            )
llm
property
Lazy initialization of LLM with structured output.
verify(claim, evidence)
async
Verify a claim against evidence using structured output.
Consensus
truthfulness_evaluator.llm.chains.consensus.ConsensusChain
Multi-model consensus with weighted voting.
Source code in src/truthfulness_evaluator/llm/chains/consensus.py
class ConsensusChain:
    """Multi-model consensus with weighted voting."""

    def __init__(
        self,
        model_names: list[str],
        weights: dict[str, float] | None = None,
        confidence_threshold: float = 0.7,
    ):
        self.model_names = model_names
        self.weights = weights or {m: 1.0 / len(model_names) for m in model_names}
        self.confidence_threshold = confidence_threshold
        self._chains = None

    @property
    def chains(self) -> list[VerificationChain]:
        """Lazy initialization of verification chains."""
        if self._chains is None:
            self._chains = [VerificationChain(m) for m in self.model_names]
        return self._chains

    async def verify(self, claim: Claim, evidence: list[Evidence]) -> VerificationResult:
        """Verify claim using multi-model consensus."""
        # Get votes from all models in parallel
        results = await asyncio.gather(*[chain.verify(claim, evidence) for chain in self.chains])
        # Collect votes and confidences
        votes = {}
        confidences = {}
        explanations = []
        for i, result in enumerate(results):
            model = self.model_names[i]
            votes[model] = result.verdict
            confidences[model] = result.confidence
            explanations.append(f"{model}: {result.verdict} (confidence: {result.confidence:.2f})")
        # Weighted voting
        weighted_votes = Counter()
        for model, verdict in votes.items():
            weight = self.weights.get(model, 1.0 / len(self.model_names))
            weighted_votes[verdict] += weight
        # Get winning verdict
        final_verdict = weighted_votes.most_common(1)[0][0]
        # Calculate overall confidence
        avg_confidence = sum(confidences.values()) / len(confidences)
        # If confidence too low, mark as NEI
        if avg_confidence < self.confidence_threshold:
            final_verdict = "NOT_ENOUGH_INFO"
        # Debug: show all votes
        vote_str = ", ".join([f"{m}: {v}" for m, v in votes.items()])
        logger.debug(f"Model votes: {vote_str}")
        # Combine evidence from all results
        all_evidence = []
        for r in results:
            all_evidence.extend(r.evidence)
        return VerificationResult(
            claim_id=claim.id,
            verdict=final_verdict,
            confidence=avg_confidence,
            evidence=all_evidence[:5],  # Deduplicate and limit
            explanation="\n".join([f"Consensus: {final_verdict}", "Model votes:", *explanations]),
            model_votes=votes,
        )
chains: list[VerificationChain]
property
Lazy initialization of verification chains.
verify(claim, evidence)
async
Verify claim using multi-model consensus.
truthfulness_evaluator.llm.chains.consensus.ICEConsensusChain
Iterative Consensus Ensemble - models critique each other.
Source code in src/truthfulness_evaluator/llm/chains/consensus.py
class ICEConsensusChain:
    """Iterative Consensus Ensemble - models critique each other."""

    def __init__(
        self, model_names: list[str], max_rounds: int = 3, confidence_threshold: float = 0.7
    ):
        self.model_names = model_names
        self.max_rounds = max_rounds
        self.confidence_threshold = confidence_threshold
        self._chains = None

    @property
    def chains(self) -> list[VerificationChain]:
        """Lazy initialization of verification chains."""
        if self._chains is None:
            self._chains = [VerificationChain(m) for m in self.model_names]
        return self._chains

    async def verify(self, claim: Claim, evidence: list[Evidence]) -> VerificationResult:
        """Verify claim using ICE (Iterative Consensus Ensemble)."""
        # Round 1: Initial votes
        results = await asyncio.gather(*[chain.verify(claim, evidence) for chain in self.chains])
        votes = {self.model_names[i]: r.verdict for i, r in enumerate(results)}
        # Rounds 2-N: Critique and revise
        for round_num in range(2, self.max_rounds + 1):
            if self._consensus_reached(votes):
                break
            # Models critique each other's reasoning
            critiques = await self._gather_critiques(claim, evidence, votes, round_num)
            # Revise votes based on critiques
            new_results = await asyncio.gather(
                *[
                    self._revise_vote(chain, claim, evidence, votes, critiques, round_num)
                    for chain in self.chains
                ]
            )
            votes = {self.model_names[i]: r.verdict for i, r in enumerate(new_results)}
        # Final aggregation
        return self._aggregate_results(claim, votes, results, evidence)

    def _consensus_reached(self, votes: dict[str, Verdict]) -> bool:
        """Check if all models agree."""
        return len(set(votes.values())) == 1

    async def _gather_critiques(
        self, claim: Claim, evidence: list[Evidence], votes: dict[str, Verdict], round_num: int
    ) -> dict[str, str]:
        """Gather critiques from each model about others' reasoning."""
        # Simplified - in full implementation, each model critiques others
        return {model: f"Round {round_num} critique" for model in self.model_names}

    async def _revise_vote(
        self,
        chain: VerificationChain,
        claim: Claim,
        evidence: list[Evidence],
        current_votes: dict[str, Verdict],
        critiques: dict[str, str],
        round_num: int,
    ) -> VerificationResult:
        """Revise vote based on critiques."""
        # For now, just re-verify (full implementation would incorporate critiques)
        return await chain.verify(claim, evidence)

    def _aggregate_results(
        self,
        claim: Claim,
        votes: dict[str, Verdict],
        results: list[VerificationResult],
        evidence: list[Evidence],
    ) -> VerificationResult:
        """Aggregate final results."""
        # Simple majority vote
        vote_counts = Counter(votes.values())
        final_verdict = vote_counts.most_common(1)[0][0]
        # Average confidence
        avg_confidence = sum(r.confidence for r in results) / len(results)
        if avg_confidence < self.confidence_threshold:
            final_verdict = "NOT_ENOUGH_INFO"
        return VerificationResult(
            claim_id=claim.id,
            verdict=final_verdict,
            confidence=avg_confidence,
            evidence=evidence,
            explanation=f"ICE Consensus after up to {self.max_rounds} rounds",
            model_votes=votes,
        )
chains: list[VerificationChain]
property
Lazy initialization of verification chains.
verify(claim, evidence)
async
Verify claim using ICE (Iterative Consensus Ensemble).
Evidence Processing
truthfulness_evaluator.llm.chains.evidence.EvidenceProcessor
Process and analyze evidence for claims using structured outputs.
Source code in src/truthfulness_evaluator/llm/chains/evidence.py
class EvidenceProcessor:
    """Process and analyze evidence for claims using structured outputs."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.model = model
        self._llm = None

    @property
    def llm(self):
        """Lazy initialization of LLM with structured output."""
        if self._llm is None:
            base_llm = create_chat_model(self.model, temperature=0)
            self._llm = base_llm.with_structured_output(EvidenceAnalysisOutput)
        return self._llm

    async def analyze_evidence(
        self, claim: Claim, evidence_list: list[Evidence]
    ) -> tuple[list[Evidence], str]:
        """
        Analyze evidence and determine which pieces are relevant.

        Returns:
            Tuple of (filtered_evidence, analysis_summary)
        """
        if not evidence_list:
            return [], "No evidence provided"
        # Build evidence text
        evidence_text = "\n\n---\n\n".join(
            [
                f"[{i}] Source: {e.source}\nType: {e.source_type}\nContent: {e.content[:800]}"
                for i, e in enumerate(evidence_list[:5])  # Top 5 pieces
            ]
        )
        chain = EVIDENCE_ANALYSIS_PROMPT | self.llm
        try:
            result: EvidenceAnalysisOutput = await chain.ainvoke(
                {"claim": claim.text, "evidence": evidence_text}
            )
            # Update evidence with analysis
            for analysis in result.evidence_analysis:
                idx = analysis.index
                if 0 <= idx < len(evidence_list):
                    evidence_list[idx].relevance_score = max(0.0, min(1.0, analysis.relevance))
                    evidence_list[idx].supports_claim = analysis.supports
                    evidence_list[idx].credibility_score = max(0.0, min(1.0, analysis.credibility))
            # Sort by relevance
            evidence_list.sort(key=lambda e: e.relevance_score, reverse=True)
            return evidence_list, result.summary
        except Exception as e:
            # If analysis fails, return original evidence
            return evidence_list, f"Analysis failed: {str(e)}"

    async def synthesize_evidence(self, claim: Claim, evidence_list: list[Evidence]) -> str:
        """
        Synthesize multiple pieces of evidence into a coherent summary.

        Returns:
            Summary text
        """
        if not evidence_list:
            return "No evidence available"
        # Filter to high-relevance evidence
        good_evidence = [e for e in evidence_list if e.relevance_score >= 0.5][:3]
        if not good_evidence:
            return "No highly relevant evidence found"
        # Simple synthesis without LLM for speed
        parts = []
        for e in good_evidence:
            support = (
                "supports"
                if e.supports_claim
                else "refutes" if e.supports_claim is False else "is neutral on"
            )
            parts.append(
                f"{e.source} ({e.source_type}) {support} the claim with {e.relevance_score:.0%} relevance"
            )
        return "; ".join(parts)
llm
property
Lazy initialization of LLM with structured output.
analyze_evidence(claim, evidence_list)
async
Analyze evidence and determine which pieces are relevant.
Returns:
tuple[list[Evidence], str]: Tuple of (filtered_evidence, analysis_summary)
synthesize_evidence(claim, evidence_list)
async
Synthesize multiple pieces of evidence into a coherent summary.
Returns:
str: Summary text