Source code for ExposoGraph.db_clients.kegg

"""KEGG REST API client for pathway and enzyme lookups.

Uses the public KEGG REST API (https://rest.kegg.jp/) to retrieve
pathway membership, enzyme annotations, and gene-pathway mappings.
No API key is required for the public endpoints.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Any

logger = logging.getLogger(__name__)

_BASE_URL = "https://rest.kegg.jp"


@dataclass
class KEGGPathway:
    """Plain record describing one KEGG pathway.

    Attributes
    ----------
    pathway_id:
        Pathway identifier as stored by the code that builds the record.
    name:
        Pathway name.
    genes:
        Gene entries attached to the pathway. Every instance gets its own
        fresh, empty list unless one is supplied.
    """

    pathway_id: str
    name: str
    genes: list[str] = field(default_factory=list)
@dataclass
class KEGGGene:
    """Plain record describing one KEGG gene entry.

    Attributes
    ----------
    gene_id:
        Gene identifier as stored by the code that builds the record.
    symbol:
        Gene symbol text.
    name:
        Gene name; empty string when not supplied.
    pathways:
        Pathway identifiers associated with the gene. Every instance gets
        its own fresh, empty list unless one is supplied.
    """

    gene_id: str
    symbol: str
    name: str = ""
    pathways: list[str] = field(default_factory=list)
class KEGGClient:
    """Lightweight client for the KEGG REST API.

    Every public method issues a single GET request through :meth:`_get` and
    parses the plain-text response: fixed-width record lines for ``get/``
    lookups, tab-separated rows for ``find/`` and ``link/``.

    Parameters
    ----------
    base_url:
        Override the KEGG REST base URL (useful for testing). Trailing
        slashes are removed before the value is stored.
    timeout:
        HTTP request timeout in seconds.

    Raises
    ------
    RuntimeError
        If the ``requests`` package cannot be imported.
    """

    def __init__(
        self,
        base_url: str = _BASE_URL,
        timeout: int = 30,
    ) -> None:
        # ``requests`` is an optional dependency: probe for it up front so a
        # missing install fails at construction time with an actionable
        # message rather than on the first lookup.
        try:
            import requests as _requests  # noqa: F401
        except ModuleNotFoundError as exc:  # pragma: no cover
            raise RuntimeError(
                "The 'requests' package is required for KEGG lookups. "
                "Install with: pip install ExposoGraph[db]"
            ) from exc
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def _get(self, path: str) -> str:
        """Perform a GET request and return the response text.

        Parameters
        ----------
        path:
            Path relative to ``base_url``, without a leading slash.

        Notes
        -----
        ``raise_for_status()`` is called on the response, so an HTTP error
        status surfaces as an exception instead of being parsed as data.
        """
        import requests

        url = f"{self.base_url}/{path}"
        resp = requests.get(url, timeout=self.timeout)
        resp.raise_for_status()
        return str(resp.text)

    @staticmethod
    def _field_body(line: str) -> str:
        """Return the content area of a KEGG fixed-width record line.

        The first 12 columns hold the field keyword (or padding on a
        continuation line); the stripped remainder is returned. Lines of
        12 characters or fewer yield an empty string.
        """
        return line[12:].strip() if len(line) > 12 else ""

    @staticmethod
    def _parse_pathway_gene(body: str) -> str | None:
        """Extract the gene symbol from a KEGG pathway GENE line.

        When the first whitespace-separated token is all digits it is
        skipped and the second token is used; otherwise the first token is
        used. A trailing ``;`` is removed. Returns ``None`` when no symbol
        token is available.
        """
        if not body:
            return None
        parts = body.split(None, 2)
        if not parts:
            return None
        if parts[0].isdigit():
            return parts[1].rstrip(";") if len(parts) > 1 else None
        return parts[0].rstrip(";")

    @staticmethod
    def _strip_path_prefix(pathway_id: str) -> str:
        """Return *pathway_id* without a leading ``path:`` namespace prefix.

        Only a prefix is removed; ``path:`` occurring elsewhere in the
        string is left untouched.
        """
        prefix = "path:"
        if pathway_id.startswith(prefix):
            return pathway_id[len(prefix):]
        return pathway_id

    @classmethod
    def _last_field(cls, lines: list[str], keyword: str) -> str:
        """Return the body of the last line starting with *keyword*.

        Returns an empty string when no line starts with *keyword*.
        """
        value = ""
        for line in lines:
            if line.startswith(keyword):
                value = cls._field_body(line)
        return value

    @classmethod
    def _section_bodies(cls, lines: list[str], keyword: str) -> list[str]:
        """Collect the non-empty bodies of a multi-line record section.

        A section opens on a line starting with *keyword* and continues over
        the following lines that start with a space. Any other line closes
        it. A later *keyword* line opens the section again.
        """
        bodies: list[str] = []
        in_section = False
        for line in lines:
            if line.startswith(keyword):
                in_section = True
            elif not (in_section and line.startswith(" ")):
                # Either a new top-level field or a line outside the section.
                in_section = False
                continue
            body = cls._field_body(line)
            if body:
                bodies.append(body)
        return bodies

    def get_pathway(self, pathway_id: str) -> KEGGPathway:
        """Fetch pathway details including member genes.

        Parameters
        ----------
        pathway_id:
            KEGG pathway identifier, e.g. ``"hsa05204"`` or
            ``"path:hsa05204"``.

        Returns
        -------
        KEGGPathway
            Record holding the identifier (without ``path:`` prefix), the
            body of the ``NAME`` line, and one symbol per parsable line of
            the ``GENE`` section, in response order.
        """
        clean_id = self._strip_path_prefix(pathway_id)
        lines = self._get(f"get/{clean_id}").splitlines()

        genes: list[str] = []
        for body in self._section_bodies(lines, "GENE"):
            gene_symbol = self._parse_pathway_gene(body)
            if gene_symbol:
                genes.append(gene_symbol)

        return KEGGPathway(
            pathway_id=clean_id,
            name=self._last_field(lines, "NAME"),
            genes=genes,
        )

    def get_gene(self, gene_id: str) -> KEGGGene:
        """Fetch a KEGG gene entry.

        Parameters
        ----------
        gene_id:
            KEGG gene identifier, e.g. ``"hsa:1543"`` for CYP1A1.

        Returns
        -------
        KEGGGene
            Record holding *gene_id* as given, the full bodies of the
            ``SYMBOL`` and ``NAME`` lines, and the first token of every
            non-empty line in the ``PATHWAY`` section.
        """
        lines = self._get(f"get/{gene_id}").splitlines()

        # Bodies are stripped and non-empty, so the split always has a token.
        pathways = [
            body.split(None, 1)[0]
            for body in self._section_bodies(lines, "PATHWAY")
        ]

        return KEGGGene(
            gene_id=gene_id,
            symbol=self._last_field(lines, "SYMBOL"),
            name=self._last_field(lines, "NAME"),
            pathways=pathways,
        )

    def find_genes(self, query: str, organism: str = "hsa") -> list[dict[str, str]]:
        """Search KEGG for genes matching a query string.

        Parameters
        ----------
        query:
            Search text, inserted verbatim into the request path.
        organism:
            Database segment of the ``find/`` request path.

        Returns
        -------
        list[dict[str, str]]
            A list of ``{"gene_id": ..., "description": ...}`` dicts, one per
            non-blank response line. ``description`` is empty when the line
            has no tab separator.
        """
        text = self._get(f"find/{organism}/{query}")
        results: list[dict[str, str]] = []
        for line in text.strip().splitlines():
            if not line.strip():
                continue
            gene_id, _, description = line.partition("\t")
            results.append({
                "gene_id": gene_id.strip(),
                "description": description.strip(),
            })
        return results

    def list_pathway_genes(self, pathway_id: str) -> list[str]:
        """Return gene IDs belonging to a pathway via the ``/link`` endpoint.

        Parameters
        ----------
        pathway_id:
            KEGG pathway identifier, e.g. ``"hsa05204"``. A leading
            ``path:`` prefix is removed.

        Returns
        -------
        list[str]
            The stripped second tab-separated column of every non-blank
            response line that has at least two columns.
        """
        clean_id = self._strip_path_prefix(pathway_id)
        text = self._get(f"link/genes/{clean_id}")
        genes: list[str] = []
        for line in text.strip().splitlines():
            if not line.strip():
                continue
            columns = line.split("\t")
            if len(columns) >= 2:
                genes.append(columns[1].strip())
        return genes