# Source code for ExposoGraph.llm_extractor

"""LLM-powered entity / relation extraction for the knowledge graph.

Supports multiple LLM backends (OpenAI, Ollama) via a pluggable protocol.
Falls back to JSON-mode parsing when structured output is unavailable.
"""

from __future__ import annotations

import json
import logging
import os
from typing import Optional

from .config import GraphMode
from .grounding import prepare_knowledge_graph
from .llm_backend import LLMBackend, OpenAIBackend, UsageRecord
from .models import KnowledgeGraph, RecordOrigin

# Module-level logger named after this module; extract_graph_with_usage()
# emits each warning returned by prepare_knowledge_graph() through it.
logger = logging.getLogger(__name__)

# System prompt handed to ``backend.extract_json`` by extract_graph_with_usage().
# It instructs the model to answer with JSON only, enumerates the allowed node
# and edge "type" strings, and spells out the expected "nodes"/"edges" layout.
# NOTE(review): the backend's reply is unpacked straight into
# ``KnowledgeGraph(**raw)``, so the field names listed here presumably have to
# stay in sync with the models in ``.models`` — confirm before editing either.
SYSTEM_PROMPT = """\
You are an expert biochemist and toxicologist. Your task is to extract a
structured knowledge graph from the user's natural-language description of
carcinogen metabolism, gene interactions, and DNA damage pathways.

Return **only** valid JSON matching the schema below. Do not include any
text outside the JSON block.

### Node types (use exactly these strings for "type"):
- Carcinogen  — chemical agents; include "group" (e.g. PAH, HCA, Aromatic_Amine,
  Nitrosamine, Mycotoxin, Estrogen, Androgen, Solvent, Alkylating) and "iarc"
  classification (Group 1 / 2A / 2B / 3).
- Enzyme      — include "phase" (I, II, III, when applicable) and "role"
  (Activation, Detoxification, Mixed, Transport, Repair). For DNA repair
  proteins, use "role": "Repair" and store the repair class in "group"
  (for example "DNA Repair (BER)" or "DNA Repair (NER)") instead of using
  "phase": "Repair".
- Gene        — a gene locus (use when distinguishing the gene from its encoded enzyme,
  e.g. for pharmacogenomic variants or tissue expression context).
- Metabolite  — include "reactivity" (High, Intermediate, Low).
- DNA_Adduct  — DNA lesion types.
- Pathway     — biological pathways; use KEGG IDs when possible.
- Tissue      — anatomical tissue or organ where expression/metabolism occurs.

### Edge types (use exactly these strings for "type"):
- ACTIVATES     — enzyme activates a procarcinogen → reactive metabolite
- DETOXIFIES    — enzyme conjugates / inactivates a metabolite
- TRANSPORTS    — efflux transporter moves a conjugate out of the cell
- FORMS_ADDUCT  — reactive metabolite covalently modifies DNA
- REPAIRS       — DNA repair enzyme removes a lesion
- PATHWAY       — node belongs to a biological pathway
- EXPRESSED_IN  — gene or enzyme is expressed in a tissue
- INDUCES       — substance or exposure induces enzyme expression/activity
- INHIBITS      — substance or exposure inhibits enzyme expression/activity
- ENCODES       — gene encodes an enzyme

### JSON Schema:
{
  "nodes": [
    {
      "id": "<short_unique_id>",
      "label": "<display name>",
      "type": "<NodeType>",
      "detail": "<one-line description>",
      "group": "<carcinogen class or repair class, or null>",
      "iarc": "<IARC group or null>",
      "phase": "<enzyme phase or null>",
      "role": "<enzyme role or null>",
      "reactivity": "<metabolite reactivity or null>",
      "source_db": "<supporting database(s) such as NCBI Gene, GTEx, ClinPGx, CTD, IARC, or KEGG, or null>",
      "evidence": "<brief evidence note or null>",
      "pmid": "<PubMed ID or null>",
      "tissue": "<relevant tissue context or null>",
      "variant": "<star allele or variant name or null>",
      "phenotype": "<functional phenotype such as poor metabolizer or null>",
      "activity_score": "<numeric activity score or null>",
      "tier": "<gene panel tier: 1, 2, or null>"
    }
  ],
  "edges": [
    {
      "source": "<source node id>",
      "target": "<target node id>",
      "type": "<EdgeType>",
      "label": "<short description of the reaction>",
      "carcinogen": "<id of the parent carcinogen, if applicable, or null>",
      "source_db": "<supporting database(s) such as NCBI Gene, CTD, IARC, or KEGG, or null>",
      "evidence": "<brief evidence note or null>",
      "pmid": "<PubMed ID or null>",
      "tissue": "<relevant tissue context or null>"
    }
  ]
}

Guidelines:
- Generate concise, uppercase-safe IDs (e.g. "BaP", "CYP1A1", "BPDE_dG").
- Every edge's source and target MUST reference an id that exists in the nodes list.
- Include the full metabolic chain: activation → metabolite → adduct → repair.
- Also include detoxification / conjugation branches when mentioned.
- If the user mentions KEGG pathway IDs, include Pathway nodes.
- Add annotation fields only when supported by the text; otherwise return null.
- Use `source_db` to reflect database-style provenance such as NCBI Gene, GTEx, ClinPGx, CTD, IARC, and KEGG.
- Capture tissue specificity, pharmacogenomic variants, and metabolizer phenotype when the text provides them.
"""



# [docs]
def extract_graph(
    text: str,
    *,
    model: str = "gpt-4o",
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    backend: Optional[LLMBackend] = None,
    mode: GraphMode | str = GraphMode.EXPLORATORY,
) -> KnowledgeGraph:
    """Extract a knowledge graph from *text*, discarding usage metadata.

    This is a convenience wrapper around :func:`extract_graph_with_usage`:
    every argument is forwarded unchanged and only the graph half of the
    returned ``(graph, usage)`` pair is handed back to the caller.

    Args:
        text: Natural-language description sent to the LLM.
        model: Model identifier forwarded to the backend.
        api_key: Credential used when no *backend* is supplied.
        base_url: Endpoint override used when no *backend* is supplied.
        backend: Backend to use directly; when ``None`` the delegate
            constructs an :class:`OpenAIBackend` from the credentials.
        mode: Graph mode forwarded to the delegate.

    Returns:
        The graph produced by :func:`extract_graph_with_usage`.
    """
    forwarded = {
        "model": model,
        "api_key": api_key,
        "base_url": base_url,
        "backend": backend,
        "mode": mode,
    }
    # The second element is the token-usage record, which this wrapper drops.
    graph, _ = extract_graph_with_usage(text, **forwarded)
    return graph



def extract_graph_with_usage(
    text: str,
    *,
    model: str = "gpt-4o",
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    backend: Optional[LLMBackend] = None,
    mode: GraphMode | str = GraphMode.EXPLORATORY,
) -> tuple[KnowledgeGraph, UsageRecord]:
    """Extract a knowledge graph from *text* and report token usage.

    Same contract as :func:`extract_graph`, except that the usage record
    returned by the backend is passed back alongside the graph.

    Args:
        text: Natural-language description sent to the LLM.
        model: Model identifier passed to ``backend.extract_json``.
        api_key: Credential for the default :class:`OpenAIBackend`; only
            consulted when *backend* is ``None``.
        base_url: Endpoint override for the default backend; only consulted
            when *backend* is ``None``.
        backend: Backend to call. When ``None`` an :class:`OpenAIBackend`
            is built from *api_key* and *base_url*.
        mode: Forwarded to :func:`prepare_knowledge_graph`.

    Returns:
        A ``(graph, usage)`` pair: the graph returned by
        :func:`prepare_knowledge_graph` and the usage value returned by the
        backend call.
    """
    llm = (
        backend
        if backend is not None
        else OpenAIBackend(api_key=api_key, base_url=base_url)
    )

    payload, usage = llm.extract_json(text, SYSTEM_PROMPT, model)

    # First pass: let the model class parse the backend's payload.
    parsed = KnowledgeGraph(**payload)

    # Second pass: rebuild the graph from copies whose origin is set to LLM.
    llm_nodes = [
        item.model_copy(update={"origin": RecordOrigin.LLM})
        for item in parsed.nodes
    ]
    llm_edges = [
        item.model_copy(update={"origin": RecordOrigin.LLM})
        for item in parsed.edges
    ]
    tagged = KnowledgeGraph(nodes=llm_nodes, edges=llm_edges)

    prepared_graph, warnings = prepare_knowledge_graph(tagged, mode=mode)
    # Warnings from preparation are logged, not raised; the graph is still returned.
    for message in warnings:
        logger.warning(message)
    return prepared_graph, usage


# Sample free-text description (benzo[a]pyrene metabolism) that exercises
# activation, detoxification, transport, adduct formation, repair, and KEGG
# pathway mentions. Not referenced by the functions visible in this module;
# presumably consumed by demos, docs, or tests — verify against callers.
EXAMPLE_INPUT = """\
Benzo[a]pyrene (BaP) is a Group 1 PAH carcinogen found in tobacco smoke.
CYP1A1 and CYP1B1 epoxidize BaP to BaP-7,8-epoxide (high reactivity).
EPHX1 hydrolyzes the epoxide to BaP-7,8-diol (intermediate reactivity).
A second epoxidation by CYP1A1 produces the ultimate carcinogen BPDE
(high reactivity), which forms BPDE-N2-dG DNA adducts repaired by
nucleotide excision repair enzymes XPC and ERCC2/XPD.
Detoxification is handled by GSTM1 and GSTP1, which conjugate BPDE
with glutathione to form BPDE-GSH (low reactivity), effluxed by ABCB1
and ABCC2. BaP also generates 8-oxo-dG via ROS, repaired by OGG1.
BaP maps to KEGG pathways 05204 (Chemical Carcinogenesis — DNA adducts)
and 00980 (Xenobiotic metabolism by CYP450).
"""