Chains

Legacy API

The chains module is being replaced by the pluggable adapter architecture. New code should use the adapter classes in the extractors/, gatherers/, verifiers/, and formatters/ packages instead; see the Adapters API Reference for the modern interface.

Claim Extraction

SimpleClaimExtractionChain

from truthfulness_evaluator.llm.chains.extraction import SimpleClaimExtractionChain

extractor = SimpleClaimExtractionChain(model="gpt-4o-mini")
claims = await extractor.extract(
    document="Python was created in 1991...",
    source_path="README.md",
    max_claims=10
)

Output: list[Claim]

Uses structured output (ClaimExtractionOutput):

class ClaimExtractionOutput(BaseModel):
    claims: list[ExtractedClaim]

class ExtractedClaim(BaseModel):
    text: str
    claim_type: str  # "explicit", "implicit", "inferred"
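
The returned Claim objects carry an id, the claim text, the source document path, and the claim type (the fields match the Claim(...) constructor calls in the source below). A quick way to inspect them, assuming the extract call above has run:

for claim in claims:
    print(claim.id, claim.claim_type, claim.text)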

TripletExtractionChain

from truthfulness_evaluator.llm.chains.extraction import TripletExtractionChain

extractor = TripletExtractionChain(model="gpt-4o-mini")
claims = await extractor.extract(document, source_path)

Extracts subject-relation-object triplets.
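
Each triplet is flattened into claim text by joining its parts (mirroring the f-string join in the source below); the values here are illustrative:

subject, relation, obj = "Python", "was created in", "1991"
claim_text = f"{subject} {relation} {obj}"  # "Python was created in 1991"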

Verification

VerificationChain

from truthfulness_evaluator.llm.chains.verification import VerificationChain

verifier = VerificationChain(model_name="gpt-4o")
result = await verifier.verify(claim, evidence)

Output: VerificationResult

Uses structured output (VerificationOutput):

class VerificationOutput(BaseModel):
    verdict: str           # "SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO"
    confidence: float      # 0.0 to 1.0
    reasoning: str
    key_evidence: str | None
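
The fields on the returned VerificationResult mirror this structure (see the source below). A minimal way to read them, assuming claim and evidence already exist:

result = await verifier.verify(claim, evidence)
print(result.verdict)      # "SUPPORTS", "REFUTES", or "NOT_ENOUGH_INFO"
print(result.confidence)   # 0.0 to 1.0, capped at 0.3 when no evidence is given
print(result.explanation)  # reasoning, plus key evidence when provided
print(result.model_votes)  # {"gpt-4o": "SUPPORTS"}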

Consensus

ConsensusChain

from truthfulness_evaluator.llm.chains.consensus import ConsensusChain

consensus = ConsensusChain(
    model_names=["gpt-4o", "gpt-4o-mini"],
    weights={"gpt-4o": 0.6, "gpt-4o-mini": 0.4},
    confidence_threshold=0.7
)

result = await consensus.verify(claim, evidence)

Weighted voting: each model votes, each vote is multiplied by the model's weight, and the verdict with the highest weighted total wins. If the average confidence across models falls below confidence_threshold, the verdict is downgraded to NOT_ENOUGH_INFO.
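
A self-contained sketch of the aggregation step (it mirrors the Counter logic in the ConsensusChain source below; the votes and weights are made-up example values):

from collections import Counter

votes = {"gpt-4o": "SUPPORTS", "gpt-4o-mini": "REFUTES"}
weights = {"gpt-4o": 0.6, "gpt-4o-mini": 0.4}

weighted_votes = Counter()
for model, verdict in votes.items():
    weighted_votes[verdict] += weights.get(model, 1.0 / len(votes))

final_verdict = weighted_votes.most_common(1)[0][0]  # "SUPPORTS" (0.6 > 0.4)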

ICEConsensusChain

from truthfulness_evaluator.llm.chains.consensus import ICEConsensusChain

ice = ICEConsensusChain(
    model_names=["gpt-4o", "gpt-4o-mini"],
    max_rounds=3
)

result = await ice.verify(claim, evidence)

Iterative Consensus Ensemble: models vote, then critique and revise their verdicts over up to max_rounds rounds, stopping early once every model agrees.
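
A minimal, runnable sketch of the ICE control flow (the voter here is a hypothetical stand-in; the real chain calls VerificationChain.verify per model and gathers critiques between rounds):

import asyncio
import random

async def vote(model: str) -> str:
    # stand-in for a per-model verification call
    return random.choice(["SUPPORTS", "REFUTES"])

async def ice_rounds(models: list[str], max_rounds: int = 3) -> dict[str, str]:
    votes = {m: await vote(m) for m in models}
    for _ in range(2, max_rounds + 1):
        if len(set(votes.values())) == 1:  # consensus: every model agrees
            break
        votes = {m: await vote(m) for m in models}  # critique-and-revise round
    return votes

print(asyncio.run(ice_rounds(["gpt-4o", "gpt-4o-mini"])))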

Evidence Processing

EvidenceProcessor

from truthfulness_evaluator.llm.chains.evidence import EvidenceProcessor

processor = EvidenceProcessor(model="gpt-4o-mini")

# Analyze evidence relevance
evidence, summary = await processor.analyze_evidence(claim, evidence_list)

# Synthesize multiple pieces
synthesis = await processor.synthesize_evidence(claim, evidence_list)

Output: tuple[list[Evidence], str] (the evidence annotated with relevance, support, and credibility scores, plus a summary string)

Uses structured output (EvidenceAnalysisOutput):

class EvidenceAnalysisOutput(BaseModel):
    evidence_analysis: list[EvidenceAnalysisItem]
    summary: str

class EvidenceAnalysisItem(BaseModel):
    index: int
    relevance: float
    supports: bool | None
    credibility: float
    reasoning: str
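
Internally, each EvidenceAnalysisItem is applied back onto the matching Evidence object by index, and the list is then sorted by relevance (a sketch of that update step; evidence_list and result, an EvidenceAnalysisOutput, are assumed to exist):

for item in result.evidence_analysis:
    if 0 <= item.index < len(evidence_list):
        evidence_list[item.index].relevance_score = max(0.0, min(1.0, item.relevance))
        evidence_list[item.index].supports_claim = item.supports
        evidence_list[item.index].credibility_score = max(0.0, min(1.0, item.credibility))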

Custom Chains

Build your own:

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel

# Define output structure
class MyOutput(BaseModel):
    result: str
    confidence: float

# Create chain
prompt = ChatPromptTemplate.from_template("Verify: {claim}")
llm = ChatOpenAI(model="gpt-4o").with_structured_output(MyOutput)
chain = prompt | llm

# Use
result = await chain.ainvoke({"claim": "Python was created in 1991"})
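
Because the LLM was wrapped with with_structured_output(MyOutput), result is a MyOutput instance and its fields are available directly:

print(result.result, result.confidence)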

API Reference

Claim Extraction

truthfulness_evaluator.llm.chains.extraction.SimpleClaimExtractionChain

Fallback claim extraction using structured LLM outputs.

Source code in src/truthfulness_evaluator/llm/chains/extraction.py
class SimpleClaimExtractionChain:
    """Fallback claim extraction using structured LLM outputs."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.model = model
        self._llm = None

    @property
    def llm(self):
        """Lazy initialization of LLM with structured output."""
        if self._llm is None:
            base_llm = create_chat_model(self.model, temperature=0)
            # Use structured output
            self._llm = base_llm.with_structured_output(ClaimExtractionOutput)
        return self._llm

    async def extract(
        self, document: str, source_path: str, max_claims: Optional[int] = None
    ) -> list[Claim]:
        """Extract claims using structured LLM output."""

        chain = CLAIM_EXTRACTION_PROMPT | self.llm

        try:
            result: ClaimExtractionOutput = await chain.ainvoke({"text": document})

            claims = []
            for i, extracted in enumerate(result.claims):
                if max_claims and i >= max_claims:
                    break

                if extracted.text.strip():
                    claims.append(
                        Claim(
                            id=f"claim_{i:03d}",
                            text=extracted.text.strip(),
                            source_document=source_path,
                            claim_type=(
                                extracted.claim_type
                                if extracted.claim_type in ["explicit", "implicit", "inferred"]
                                else "explicit"
                            ),
                        )
                    )

            return claims

        except Exception as e:
            logger.warning(f"Simple extraction failed: {e}")
            return []

llm property

Lazy initialization of LLM with structured output.

extract(document, source_path, max_claims=None) async

Extract claims using structured LLM output.

truthfulness_evaluator.llm.chains.extraction.TripletExtractionChain

Extract claims as subject-relation-object triplets using structured output.

Source code in src/truthfulness_evaluator/llm/chains/extraction.py
class TripletExtractionChain:
    """Extract claims as subject-relation-object triplets using structured output."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.model = model
        self._llm = None

    @property
    def llm(self):
        """Lazy initialization of LLM with structured output."""
        if self._llm is None:
            base_llm = create_chat_model(self.model, temperature=0)
            self._llm = base_llm.with_structured_output(TripletExtractionOutput)
        return self._llm

    async def extract(
        self, document: str, source_path: str, max_claims: Optional[int] = None
    ) -> list[Claim]:
        """Extract claims as triplets using structured output."""

        chain = TRIPLET_EXTRACTION_PROMPT | self.llm

        try:
            result: TripletExtractionOutput = await chain.ainvoke({"text": document})

            claims = []
            for i, triplet in enumerate(result.triplets):
                if max_claims and i >= max_claims:
                    break

                claim_text = f"{triplet.subject} {triplet.relation} {triplet.object}".strip()

                if claim_text:
                    claims.append(
                        Claim(
                            id=f"claim_{i:03d}",
                            text=claim_text,
                            source_document=source_path,
                            context=triplet.context,
                            claim_type="explicit",
                        )
                    )

            return claims

        except Exception as e:
            logger.warning(f"Triplet extraction failed: {e}")
            return []

llm property

Lazy initialization of LLM with structured output.

extract(document, source_path, max_claims=None) async

Extract claims as triplets using structured output.

Verification

truthfulness_evaluator.llm.chains.verification.VerificationChain

Single-model verification chain with structured outputs.

Source code in src/truthfulness_evaluator/llm/chains/verification.py
class VerificationChain:
    """Single-model verification chain with structured outputs."""

    def __init__(self, model_name: str = "gpt-4o"):
        self.model_name = model_name
        self._llm = None

    @property
    def llm(self):
        """Lazy initialization of LLM with structured output."""
        if self._llm is None:
            base_llm = create_chat_model(self.model_name, temperature=0)
            # Use structured output
            self._llm = base_llm.with_structured_output(VerificationOutput)
        return self._llm

    async def verify(self, claim: Claim, evidence: list[Evidence]) -> VerificationResult:
        """Verify a claim against evidence using structured output."""

        # Build evidence text with relevance indicators
        if evidence:
            evidence_parts = []
            for i, e in enumerate(evidence[:4], 1):  # Top 4 evidence items
                support_indicator = ""
                if e.supports_claim is True:
                    support_indicator = "[SUPPORTS]"
                elif e.supports_claim is False:
                    support_indicator = "[REFUTES]"
                else:
                    support_indicator = "[NEUTRAL]"

                evidence_parts.append(
                    f"\n--- Evidence {i} ({e.source_type}) {support_indicator} ---\n"
                    f"Source: {e.source}\n"
                    f"Relevance: {e.relevance_score:.0%}\n"
                    f"Content: {e.content[:600]}"
                )

            evidence_text = "\n".join(evidence_parts)
        else:
            evidence_text = "No evidence provided."

        chain = VERIFICATION_PROMPT | self.llm

        try:
            result: VerificationOutput = await chain.ainvoke(
                {"claim": claim.text, "evidence": evidence_text}
            )

            # Normalize verdict
            verdict = result.verdict.upper()
            if verdict not in ["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO"]:
                verdict = "NOT_ENOUGH_INFO"

            # Combine reasoning with key evidence
            full_explanation = result.reasoning
            if result.key_evidence:
                full_explanation += f"\n\nKey evidence: {result.key_evidence}"

            # Adjust confidence based on evidence quality
            confidence = max(0.0, min(1.0, result.confidence))
            if not evidence:
                confidence = min(confidence, 0.3)

            return VerificationResult(
                claim_id=claim.id,
                verdict=verdict,
                confidence=confidence,
                evidence=evidence,
                explanation=full_explanation,
                model_votes={self.model_name: verdict},
            )

        except Exception as e:
            # Fallback result on error
            return VerificationResult(
                claim_id=claim.id,
                verdict="NOT_ENOUGH_INFO",
                confidence=0.0,
                evidence=evidence,
                explanation=f"Verification failed: {str(e)}",
                model_votes={self.model_name: "NOT_ENOUGH_INFO"},
            )

llm property

Lazy initialization of LLM with structured output.

verify(claim, evidence) async

Verify a claim against evidence using structured output.

Consensus

truthfulness_evaluator.llm.chains.consensus.ConsensusChain

Multi-model consensus with weighted voting.

Source code in src/truthfulness_evaluator/llm/chains/consensus.py
class ConsensusChain:
    """Multi-model consensus with weighted voting."""

    def __init__(
        self,
        model_names: list[str],
        weights: dict[str, float] | None = None,
        confidence_threshold: float = 0.7,
    ):
        self.model_names = model_names
        self.weights = weights or {m: 1.0 / len(model_names) for m in model_names}
        self.confidence_threshold = confidence_threshold
        self._chains = None

    @property
    def chains(self) -> list[VerificationChain]:
        """Lazy initialization of verification chains."""
        if self._chains is None:
            self._chains = [VerificationChain(m) for m in self.model_names]
        return self._chains

    async def verify(self, claim: Claim, evidence: list[Evidence]) -> VerificationResult:
        """Verify claim using multi-model consensus."""
        # Get votes from all models in parallel
        results = await asyncio.gather(*[chain.verify(claim, evidence) for chain in self.chains])

        # Collect votes and confidences
        votes = {}
        confidences = {}
        explanations = []

        for i, result in enumerate(results):
            model = self.model_names[i]
            votes[model] = result.verdict
            confidences[model] = result.confidence
            explanations.append(f"{model}: {result.verdict} (confidence: {result.confidence:.2f})")

        # Weighted voting
        weighted_votes = Counter()
        for model, verdict in votes.items():
            weight = self.weights.get(model, 1.0 / len(self.model_names))
            weighted_votes[verdict] += weight

        # Get winning verdict
        final_verdict = weighted_votes.most_common(1)[0][0]

        # Calculate overall confidence
        avg_confidence = sum(confidences.values()) / len(confidences)

        # If confidence too low, mark as NEI
        if avg_confidence < self.confidence_threshold:
            final_verdict = "NOT_ENOUGH_INFO"

        # Debug: show all votes
        vote_str = ", ".join([f"{m}: {v}" for m, v in votes.items()])
        logger.debug(f"Model votes: {vote_str}")

        # Combine evidence from all results
        all_evidence = []
        for r in results:
            all_evidence.extend(r.evidence)

        return VerificationResult(
            claim_id=claim.id,
            verdict=final_verdict,
            confidence=avg_confidence,
            evidence=all_evidence[:5],  # Deduplicate and limit
            explanation="\n".join([f"Consensus: {final_verdict}", "Model votes:", *explanations]),
            model_votes=votes,
        )

chains: list[VerificationChain] property

Lazy initialization of verification chains.

verify(claim, evidence) async

Verify claim using multi-model consensus.

truthfulness_evaluator.llm.chains.consensus.ICEConsensusChain

Iterative Consensus Ensemble - models critique each other.

Source code in src/truthfulness_evaluator/llm/chains/consensus.py
class ICEConsensusChain:
    """Iterative Consensus Ensemble - models critique each other."""

    def __init__(
        self, model_names: list[str], max_rounds: int = 3, confidence_threshold: float = 0.7
    ):
        self.model_names = model_names
        self.max_rounds = max_rounds
        self.confidence_threshold = confidence_threshold
        self._chains = None

    @property
    def chains(self) -> list[VerificationChain]:
        """Lazy initialization of verification chains."""
        if self._chains is None:
            self._chains = [VerificationChain(m) for m in self.model_names]
        return self._chains

    async def verify(self, claim: Claim, evidence: list[Evidence]) -> VerificationResult:
        """Verify claim using ICE (Iterative Consensus Ensemble)."""

        # Round 1: Initial votes
        results = await asyncio.gather(*[chain.verify(claim, evidence) for chain in self.chains])

        votes = {self.model_names[i]: r.verdict for i, r in enumerate(results)}

        # Rounds 2-N: Critique and revise
        for round_num in range(2, self.max_rounds + 1):
            if self._consensus_reached(votes):
                break

            # Models critique each other's reasoning
            critiques = await self._gather_critiques(claim, evidence, votes, round_num)

            # Revise votes based on critiques
            new_results = await asyncio.gather(
                *[
                    self._revise_vote(chain, claim, evidence, votes, critiques, round_num)
                    for chain in self.chains
                ]
            )

            votes = {self.model_names[i]: r.verdict for i, r in enumerate(new_results)}

        # Final aggregation
        return self._aggregate_results(claim, votes, results, evidence)

    def _consensus_reached(self, votes: dict[str, Verdict]) -> bool:
        """Check if all models agree."""
        return len(set(votes.values())) == 1

    async def _gather_critiques(
        self, claim: Claim, evidence: list[Evidence], votes: dict[str, Verdict], round_num: int
    ) -> dict[str, str]:
        """Gather critiques from each model about others' reasoning."""
        # Simplified - in full implementation, each model critiques others
        return {model: f"Round {round_num} critique" for model in self.model_names}

    async def _revise_vote(
        self,
        chain: VerificationChain,
        claim: Claim,
        evidence: list[Evidence],
        current_votes: dict[str, Verdict],
        critiques: dict[str, str],
        round_num: int,
    ) -> VerificationResult:
        """Revise vote based on critiques."""
        # For now, just re-verify (full implementation would incorporate critiques)
        return await chain.verify(claim, evidence)

    def _aggregate_results(
        self,
        claim: Claim,
        votes: dict[str, Verdict],
        results: list[VerificationResult],
        evidence: list[Evidence],
    ) -> VerificationResult:
        """Aggregate final results."""
        # Simple majority vote
        vote_counts = Counter(votes.values())
        final_verdict = vote_counts.most_common(1)[0][0]

        # Average confidence
        avg_confidence = sum(r.confidence for r in results) / len(results)

        if avg_confidence < self.confidence_threshold:
            final_verdict = "NOT_ENOUGH_INFO"

        return VerificationResult(
            claim_id=claim.id,
            verdict=final_verdict,
            confidence=avg_confidence,
            evidence=evidence,
            explanation=f"ICE Consensus after up to {self.max_rounds} rounds",
            model_votes=votes,
        )

chains: list[VerificationChain] property

Lazy initialization of verification chains.

verify(claim, evidence) async

Verify claim using ICE (Iterative Consensus Ensemble).

Evidence Processing

truthfulness_evaluator.llm.chains.evidence.EvidenceProcessor

Process and analyze evidence for claims using structured outputs.

Source code in src/truthfulness_evaluator/llm/chains/evidence.py
class EvidenceProcessor:
    """Process and analyze evidence for claims using structured outputs."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.model = model
        self._llm = None

    @property
    def llm(self):
        """Lazy initialization of LLM with structured output."""
        if self._llm is None:
            base_llm = create_chat_model(self.model, temperature=0)
            self._llm = base_llm.with_structured_output(EvidenceAnalysisOutput)
        return self._llm

    async def analyze_evidence(
        self, claim: Claim, evidence_list: list[Evidence]
    ) -> tuple[list[Evidence], str]:
        """
        Analyze evidence and determine which pieces are relevant.

        Returns:
            Tuple of (filtered_evidence, analysis_summary)
        """
        if not evidence_list:
            return [], "No evidence provided"

        # Build evidence text
        evidence_text = "\n\n---\n\n".join(
            [
                f"[{i}] Source: {e.source}\nType: {e.source_type}\nContent: {e.content[:800]}"
                for i, e in enumerate(evidence_list[:5])  # Top 5 pieces
            ]
        )

        chain = EVIDENCE_ANALYSIS_PROMPT | self.llm

        try:
            result: EvidenceAnalysisOutput = await chain.ainvoke(
                {"claim": claim.text, "evidence": evidence_text}
            )

            # Update evidence with analysis
            for analysis in result.evidence_analysis:
                idx = analysis.index
                if 0 <= idx < len(evidence_list):
                    evidence_list[idx].relevance_score = max(0.0, min(1.0, analysis.relevance))
                    evidence_list[idx].supports_claim = analysis.supports
                    evidence_list[idx].credibility_score = max(0.0, min(1.0, analysis.credibility))

            # Sort by relevance
            evidence_list.sort(key=lambda e: e.relevance_score, reverse=True)

            return evidence_list, result.summary

        except Exception as e:
            # If analysis fails, return original evidence
            return evidence_list, f"Analysis failed: {str(e)}"

    async def synthesize_evidence(self, claim: Claim, evidence_list: list[Evidence]) -> str:
        """
        Synthesize multiple pieces of evidence into a coherent summary.

        Returns:
            Summary text
        """
        if not evidence_list:
            return "No evidence available"

        # Filter to high-relevance evidence
        good_evidence = [e for e in evidence_list if e.relevance_score >= 0.5][:3]

        if not good_evidence:
            return "No highly relevant evidence found"

        # Simple synthesis without LLM for speed
        parts = []
        for e in good_evidence:
            support = (
                "supports"
                if e.supports_claim
                else "refutes" if e.supports_claim is False else "is neutral on"
            )
            parts.append(
                f"{e.source} ({e.source_type}) {support} the claim with {e.relevance_score:.0%} relevance"
            )

        return "; ".join(parts)

llm property

Lazy initialization of LLM with structured output.

analyze_evidence(claim, evidence_list) async

Analyze evidence and determine which pieces are relevant.

Returns:

tuple[list[Evidence], str]: Tuple of (filtered_evidence, analysis_summary)

synthesize_evidence(claim, evidence_list) async

Synthesize multiple pieces of evidence into a coherent summary.

Returns:

str: Summary text
