Source code for sceleto.network._corr_db
"""Pre-computed correlation database loader.
Each corr DB on disk follows the layout::
{data_dir}/{name}_corr_{CT}_{version}.npy # float32, p × p, mmap-friendly
{data_dir}/{name}_gene_names_{version}.npy # str array, length p
{data_dir}/{name}_n_obs_{version}.json # {CT: n_obs}
PANGEA ships as ``name="pangea"`` / ``version="v03"`` with cell types
``{B, ENDO, EPI, FIBRO, MACRO, T}``. Any other DB built via ``build_corr_db``
follows the same layout and is loaded by overriding ``name``/``version``.
``load_corr_db(gene)`` uses memory-mapped npy files to read only the gene's
row from each cell type — O(p × 4 B) per cell type regardless of matrix size.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Optional
import numpy as np
import pandas as pd
from scipy.stats import t as student_t
# ─────────────────────────────────────────────────────────────────────────────
# Default DB identifiers (PANGEA)
# ─────────────────────────────────────────────────────────────────────────────
PANGEA_NAME = "pangea"
PANGEA_VERSION = "v03"
PANGEA_CELL_TYPES = ["B", "ENDO", "EPI", "FIBRO", "MACRO", "T"]
# ─────────────────────────────────────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────────────────────────────────────
[docs]
def list_cell_types(
data_dir: str | Path | None = None,
name: str = PANGEA_NAME,
version: str = PANGEA_VERSION,
) -> list[str]:
"""Return available cell types in a corr database.
Parameters
----------
data_dir
Directory containing the DB. If ``None``, returns the PANGEA defaults
without touching disk.
name, version
DB identifiers (only used when ``data_dir`` is given).
Returns
-------
list[str]
Cell type keys (read from ``{name}_n_obs_{version}.json``).
"""
if data_dir is None:
return list(PANGEA_CELL_TYPES)
meta_path = Path(data_dir) / f"{name}_n_obs_{version}.json"
if not meta_path.exists():
raise FileNotFoundError(f"{meta_path} not found")
with open(meta_path) as f:
n_obs_map = json.load(f)
return list(n_obs_map.keys())
[docs]
def load_corr_db(
gene: str,
data_dir: str | Path,
cell_types: Optional[list[str]] = None,
name: str = PANGEA_NAME,
version: str = PANGEA_VERSION,
) -> pd.DataFrame:
"""Load pre-computed correlations for a gene of interest.
Uses memory-mapped npy files for fast random row access.
Parameters
----------
gene
Gene name (must exist in the corr database).
data_dir
Directory containing ``{name}_corr_{CT}_{version}.npy``,
``{name}_gene_names_{version}.npy``, and ``{name}_n_obs_{version}.json``.
cell_types
Subset of cell types to load. ``None`` = all (auto-discovered from
``{name}_n_obs_{version}.json``).
name, version
DB identifiers. Defaults are PANGEA (``"pangea"`` / ``"v03"``).
Returns
-------
pd.DataFrame
Wide table: ``gene`` + ``{CT}_corr`` + ``{CT}_pval`` per cell type.
Compatible with :func:`select_top_genes`.
"""
data_dir = Path(data_dir)
# Load n_obs metadata — also gives us the full cell type list
meta_path = data_dir / f"{name}_n_obs_{version}.json"
if not meta_path.exists():
raise FileNotFoundError(f"{meta_path} not found")
with open(meta_path) as f:
n_obs_map = json.load(f)
available_cts = list(n_obs_map.keys())
cts = cell_types or available_cts
unknown = [ct for ct in cts if ct not in available_cts]
if unknown:
raise ValueError(
f"Unknown cell type(s) {unknown}. Available: {available_cts}"
)
# Load shared gene names
gn_path = data_dir / f"{name}_gene_names_{version}.npy"
if not gn_path.exists():
raise FileNotFoundError(f"{gn_path} not found")
gene_names = np.load(gn_path, allow_pickle=True)
idx_arr = np.where(gene_names == gene)[0]
if idx_arr.size == 0:
raise ValueError(f"Gene {gene!r} not found in corr database")
idx = int(idx_arr[0])
merged: Optional[pd.DataFrame] = None
for ct in cts:
npy_path = data_dir / f"{name}_corr_{ct}_{version}.npy"
if not npy_path.exists():
raise FileNotFoundError(f"{npy_path} not found")
# mmap: only reads the requested row from disk
corr = np.load(npy_path, mmap_mode="r")
r_vals = corr[idx].astype(np.float32)
n_obs = n_obs_map[ct]
dfree = n_obs - 2
# Compute p-values on the fly
r_clipped = np.clip(r_vals, -0.9999999, 0.9999999)
valid = np.abs(r_clipped) < 1.0
p_vals = np.zeros(len(r_vals), dtype=np.float32)
t_vals = r_clipped[valid] * np.sqrt(dfree / (1 - r_clipped[valid] ** 2))
p_vals[valid] = (2 * student_t.sf(np.abs(t_vals), dfree)).astype(np.float32)
sub = pd.DataFrame({
"gene": gene_names,
f"{ct}_corr": r_vals,
f"{ct}_pval": p_vals,
})
merged = sub if merged is None else merged.merge(sub, on="gene", how="outer")
return merged.reset_index(drop=True)