# Source code for ExposoGraph.db_clients.kegg

"""KEGG REST API client for pathway and enzyme lookups.

Uses the public KEGG REST API (https://rest.kegg.jp/) to retrieve
pathway membership, enzyme annotations, and gene-pathway mappings.
No API key is required for the public endpoints.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Any

logger = logging.getLogger(__name__)

_BASE_URL = "https://rest.kegg.jp"



@dataclass
class KEGGPathway:
    """Minimal representation of a KEGG pathway.

    Attributes
    ----------
    pathway_id:
        KEGG pathway identifier, e.g. ``"hsa05204"``.
    name:
        Content of the pathway's ``NAME`` field.
    genes:
        Gene symbols parsed from the pathway's ``GENE`` section; defaults to
        a fresh empty list per instance.
    """

    pathway_id: str
    name: str
    genes: list[str] = field(default_factory=list)




@dataclass
class KEGGGene:
    """Minimal representation of a KEGG gene entry.

    Attributes
    ----------
    gene_id:
        KEGG gene identifier, e.g. ``"hsa:1543"``.
    symbol:
        Content of the entry's ``SYMBOL`` field.
    name:
        Content of the entry's ``NAME`` field; empty when not provided.
    pathways:
        Identifiers of the pathways listed in the entry's ``PATHWAY``
        section; defaults to a fresh empty list per instance.
    """

    gene_id: str
    symbol: str
    name: str = ""
    pathways: list[str] = field(default_factory=list)




class KEGGClient:
    """Lightweight client for the KEGG REST API.

    Every lookup issues a plain-text GET below ``base_url`` and parses the
    response with simple fixed-width (flat-file) or tab-separated rules.

    Parameters
    ----------
    base_url:
        Override the KEGG REST base URL (useful for testing).
    timeout:
        HTTP request timeout in seconds.

    Raises
    ------
    RuntimeError
        If the optional ``requests`` package cannot be imported.
    """

    # Flat-file lines are sliced at this column: everything before it is the
    # field keyword area, everything after it is the field content.
    _KEY_WIDTH = 12
    # Optional prefix accepted on pathway identifiers ("path:hsa05204").
    _PATH_PREFIX = "path:"

    def __init__(
        self,
        base_url: str = _BASE_URL,
        timeout: int = 30,
    ) -> None:
        # Fail early with an actionable message instead of at the first call.
        try:
            import requests as _requests  # noqa: F401
        except ModuleNotFoundError as exc:  # pragma: no cover
            raise RuntimeError(
                "The 'requests' package is required for KEGG lookups. "
                "Install with: pip install ExposoGraph[db]"
            ) from exc
        # Drop trailing slashes so ``_get`` can always join with a single "/".
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def _get(self, path: str) -> str:
        """Perform a GET request and return the response text.

        ``path`` is appended to ``base_url``; ``raise_for_status()`` is
        called on the response before its text is returned.
        """
        import requests

        url = f"{self.base_url}/{path}"
        resp = requests.get(url, timeout=self.timeout)
        resp.raise_for_status()
        return str(resp.text)

    @staticmethod
    def _field_body(line: str) -> str:
        """Return the content area of a KEGG fixed-width record line."""
        # Slicing past the end of a short line already yields "".
        return line[KEGGClient._KEY_WIDTH:].strip()

    @staticmethod
    def _parse_pathway_gene(body: str) -> str | None:
        """Extract the gene symbol from a KEGG pathway GENE line.

        When the first token is numeric it is treated as the gene ID and the
        second token is returned; otherwise the first token is returned.
        A trailing ``;`` is removed. Returns ``None`` when there is nothing
        to extract.
        """
        tokens = body.split(None, 2)
        if not tokens:
            return None
        if not tokens[0].isdigit():
            return tokens[0].rstrip(";")
        if len(tokens) < 2:
            return None
        return tokens[1].rstrip(";")

    @classmethod
    def _strip_path_prefix(cls, pathway_id: str) -> str:
        """Remove a leading ``path:`` prefix from *pathway_id*, if present."""
        if pathway_id.startswith(cls._PATH_PREFIX):
            return pathway_id[len(cls._PATH_PREFIX):]
        return pathway_id

    @classmethod
    def _field_value(cls, text: str, keyword: str) -> str:
        """Return the content of the last line starting with *keyword*.

        Returns ``""`` when no line starts with *keyword*.
        """
        value = ""
        for line in text.splitlines():
            if line.startswith(keyword):
                value = cls._field_body(line)
        return value

    @classmethod
    def _section_bodies(cls, text: str, keyword: str) -> list[str]:
        """Collect the non-empty line contents of the *keyword* section.

        A section is the line starting with *keyword* plus the following
        lines indented by at least ``_KEY_WIDTH`` spaces. Any line that does
        not start with a space (another field header, a blank line, the
        record terminator) closes the section.
        """
        continuation = " " * cls._KEY_WIDTH
        bodies: list[str] = []
        inside = False
        for line in text.splitlines():
            if line.startswith(keyword):
                inside = True
            elif not line.startswith(" "):
                inside = False
                continue
            elif not (inside and line.startswith(continuation)):
                # Indented line outside the section, or indented by fewer
                # than _KEY_WIDTH spaces: ignore it and keep the current state.
                continue
            body = cls._field_body(line)
            if body:
                bodies.append(body)
        return bodies

    def get_pathway(self, pathway_id: str) -> KEGGPathway:
        """Fetch pathway details including member genes.

        Parameters
        ----------
        pathway_id:
            KEGG pathway identifier, e.g. ``"hsa05204"`` or ``"path:hsa05204"``.

        Returns
        -------
        KEGGPathway
            The identifier (without ``path:`` prefix), the ``NAME`` field and
            the symbols parsed from the ``GENE`` section.
        """
        clean_id = self._strip_path_prefix(pathway_id)
        text = self._get(f"get/{clean_id}")
        symbols = (
            self._parse_pathway_gene(body)
            for body in self._section_bodies(text, "GENE")
        )
        return KEGGPathway(
            pathway_id=clean_id,
            name=self._field_value(text, "NAME"),
            genes=[symbol for symbol in symbols if symbol],
        )

    def get_gene(self, gene_id: str) -> KEGGGene:
        """Fetch a KEGG gene entry.

        Parameters
        ----------
        gene_id:
            KEGG gene identifier, e.g. ``"hsa:1543"`` for CYP1A1.

        Returns
        -------
        KEGGGene
            The ``SYMBOL`` and ``NAME`` field contents plus the first token
            of every line in the ``PATHWAY`` section.
        """
        text = self._get(f"get/{gene_id}")
        # Bodies are non-empty after stripping, so the split has a first token.
        pathways = [
            body.split(None, 1)[0]
            for body in self._section_bodies(text, "PATHWAY")
        ]
        return KEGGGene(
            gene_id=gene_id,
            symbol=self._field_value(text, "SYMBOL"),
            name=self._field_value(text, "NAME"),
            pathways=pathways,
        )

    def find_genes(self, query: str, organism: str = "hsa") -> list[dict[str, str]]:
        """Search KEGG for genes matching a query string.

        Returns a list of ``{"gene_id": ..., "description": ...}`` dicts, one
        per non-blank response line; the description is ``""`` when the line
        has no tab-separated second column.
        """
        text = self._get(f"find/{organism}/{query}")
        results: list[dict[str, str]] = []
        for line in text.strip().splitlines():
            if not line.strip():
                continue
            gene_id, _, description = line.partition("\t")
            results.append({
                "gene_id": gene_id.strip(),
                "description": description.strip(),
            })
        return results

    def list_pathway_genes(self, pathway_id: str) -> list[str]:
        """Return gene IDs belonging to a pathway via the ``/link`` endpoint.

        Parameters
        ----------
        pathway_id:
            KEGG pathway identifier, e.g. ``"hsa05204"``.

        Returns
        -------
        list[str]
            The second tab-separated column of every response line that has
            at least two columns.
        """
        clean_id = self._strip_path_prefix(pathway_id)
        text = self._get(f"link/genes/{clean_id}")
        genes: list[str] = []
        for line in text.strip().splitlines():
            if not line.strip():
                continue
            columns = line.split("\t")
            if len(columns) >= 2:
                genes.append(columns[1].strip())
        return genes