Skip to content

MultiomicsStore

quantnado.dataset.store_multiomics.MultiomicsStore

MultiomicsStore(store_dir: Path | str)

Unified multi-modal genomic store combining coverage (BAM), methylation (bedGraph), and variant (VCF) data in a single directory.

On-disk layout::

<store_dir>/
    coverage.zarr/      (present if BAM files were provided)
    methylation.zarr/   (present if bedGraph files were provided)
    variants.zarr/      (present if VCF files were provided)

Each sub-store is a self-contained Zarr archive; access them individually via :attr:coverage, :attr:methylation, and :attr:variants, or use the high-level helpers on this class.

Example

>>> ms = MultiomicsStore.from_files(
...     store_dir="dataset/",
...     bam_files=["atac.bam", "meth-rep1.bam"],
...     methyldackel_files=["meth-rep1.bedGraph", "meth-rep2.bedGraph"],
...     vcf_files=["snp.vcf.gz"],
... )
>>> ms.modalities
['coverage', 'methylation', 'variants']
>>> ms.coverage.sample_names
['atac', 'meth-rep1']
>>> ms.methylation.to_xarray()
>>> ms.variants.extract_region("chr21:5000000-6000000")

Source code in quantnado/dataset/store_multiomics.py
def __init__(self, store_dir: Path | str) -> None:
    """Open whichever modality sub-stores already exist under *store_dir*.

    Each of ``coverage.zarr``, ``methylation.zarr``, and ``variants.zarr``
    is probed on disk; absent sub-stores leave the corresponding attribute
    as ``None``. A warning is logged when no modality is found at all.
    """
    self.store_dir = Path(store_dir)

    # One attribute per modality; remains None when the sub-store is absent.
    self.coverage: BamStore | None = None
    self.methylation: MethylStore | None = None
    self.variants: VariantStore | None = None

    # Probe the fixed on-disk layout and open whatever is present.
    for zarr_name, attr_name, store_cls in (
        ("coverage.zarr", "coverage", BamStore),
        ("methylation.zarr", "methylation", MethylStore),
        ("variants.zarr", "variants", VariantStore),
    ):
        candidate = self.store_dir / zarr_name
        if candidate.exists():
            setattr(self, attr_name, store_cls.open(candidate))

    if not self.modalities:
        logger.warning(
            f"No modality stores found in {self.store_dir}. "
            "Run from_files() to populate the store."
        )

modalities property

modalities: list[str]

List of available modalities in this store.

chromosomes property

chromosomes: list[str]

Sorted union of chromosome names across all modalities.

samples property

samples: dict[str, list[str]]

Sample names available per modality.

Returns:

Type Description
dict

Mapping modality name → list of sample names.

all_sample_names property

all_sample_names: list[str]

Ordered union of sample names across all modalities.

from_files classmethod

from_files(
    store_dir: Path | str,
    bam_files: list[str | Path] | None = None,
    methyldackel_files: list[str | Path] | None = None,
    cxreport_files: list[str | Path] | None = None,
    mc_files: list[str | Path] | None = None,
    hmc_files: list[str | Path] | None = None,
    vcf_files: list[str | Path] | None = None,
    chromsizes: str | Path | dict[str, int] | None = None,
    metadata: DataFrame
    | Path
    | str
    | list[Path | str]
    | None = None,
    *,
    bam_sample_names: list[str] | None = None,
    methyldackel_sample_names: list[str] | None = None,
    cxreport_sample_names: list[str] | None = None,
    mc_hmc_sample_names: list[str] | None = None,
    vcf_sample_names: list[str] | None = None,
    filter_chromosomes: bool = True,
    overwrite: bool = True,
    resume: bool = False,
    sample_column: str = "sample_id",
    chunk_len: int = DEFAULT_CHUNK_LEN,
    construction_compression: str = "default",
    local_staging: bool = False,
    staging_dir: "Path | str | None" = None,
    log_file: "Path | None" = None,
    max_workers: int = 1,
    test: bool = False,
    stranded: list[str] | dict[str, str] | None = None,
) -> "MultiomicsStore"

Create a MultiomicsStore from genomic data files.

At least one of bam_files, methyldackel_files, cxreport_files, mc_files/hmc_files, or vcf_files must be provided. Any omitted modality is simply absent from the store.

Parameters:

Name Type Description Default
store_dir Path or str

Output directory. Created if it does not exist.

required
bam_files list of Path

BAM files for per-base coverage storage.

None
methyldackel_files list of Path

MethylDackel CpG bedGraph files for methylation storage.

None
vcf_files list of Path

VCF.gz files (one per sample) for variant storage.

None
chromsizes str, Path, or dict

Chromosome sizes for the coverage store. Extracted from the first BAM file if not provided.

None
metadata DataFrame, Path, or str

Sample metadata CSV attached to all sub-stores. Each sub-store's sample names are used to subset the metadata automatically.

None
bam_sample_names list of str

Override sample names for BAM files (default: file stems).

None
methyldackel_sample_names list of str

Override sample names for bedGraph files (default: file stems). Useful when MethylDackel embeds genome/suffix in the filename, e.g. meth-rep1_hg38_CpG_inverted.bedGraph → "meth-rep1".

None
vcf_sample_names list of str

Override sample names for VCF files (default: filename before first dot).

None
filter_chromosomes bool

Keep only canonical chromosomes (chr* without underscores).

True
overwrite bool

Overwrite existing sub-stores.

True
resume bool

Resume processing an existing sub-store, skipping completed samples.

False
sample_column str

Column in metadata matching sample names.

"sample_id"
chunk_len int

Zarr chunk size for the position dimension (coverage store only).

65536
construction_compression ('default', 'fast', 'none')

Build-time compression profile for the coverage store.

"default"
local_staging bool

Build the coverage store under local scratch storage before publishing.

False
staging_dir str or Path

Scratch directory for local staging. Defaults to system temp dir.

None
log_file Path

Path to write BAM processing logs.

None
max_workers int

The number of threads used to process chromosomes in parallel. Samples are processed sequentially to keep memory usage low.

1
test bool

Restrict coverage to chr21/chr22/chrY (for testing).

False
stranded list of str or dict

Strand-specific coverage configuration. Pass a list of sample names to use "U" (raw alignment orientation), or a dict mapping sample names to library types ("R", "F", or "U"). Use a dict for RNA-seq so read1/read2 orientation is taken into account.

  • "R" (ISR/dUTP/TruSeq Stranded): read1 aligns to reverse strand.
  • "F" (ISF/ligation): read1 aligns to forward strand.
  • "U": split purely by raw alignment orientation.

Example::

stranded={"rna-rep1": "R", "rna-rep2": "R"}
None
Source code in quantnado/dataset/store_multiomics.py
@classmethod
def from_files(
    cls,
    store_dir: Path | str,
    bam_files: list[str | Path] | None = None,
    methyldackel_files: list[str | Path] | None = None,
    cxreport_files: list[str | Path] | None = None,
    mc_files: list[str | Path] | None = None,
    hmc_files: list[str | Path] | None = None,
    vcf_files: list[str | Path] | None = None,
    chromsizes: str | Path | dict[str, int] | None = None,
    metadata: pd.DataFrame | Path | str | list[Path | str] | None = None,
    *,
    bam_sample_names: list[str] | None = None,
    methyldackel_sample_names: list[str] | None = None,
    cxreport_sample_names: list[str] | None = None,
    mc_hmc_sample_names: list[str] | None = None,
    vcf_sample_names: list[str] | None = None,
    filter_chromosomes: bool = True,
    overwrite: bool = True,
    resume: bool = False,
    sample_column: str = "sample_id",
    chunk_len: int = DEFAULT_CHUNK_LEN,
    construction_compression: str = "default",
    local_staging: bool = False,
    staging_dir: "Path | str | None" = None,
    log_file: "Path | None" = None,
    max_workers: int = 1,
    test: bool = False,
    stranded: list[str] | dict[str, str] | None = None,
) -> "MultiomicsStore":
    """
    Create a MultiomicsStore from genomic data files.

    At least one of ``bam_files``, ``methyldackel_files``, ``cxreport_files``,
    ``mc_files``/``hmc_files``, or ``vcf_files`` must be provided. Any omitted
    modality is simply absent from the store.

    Parameters
    ----------
    store_dir : Path or str
        Output directory. Created if it does not exist.
    bam_files : list of Path, optional
        BAM files for per-base coverage storage.
    methyldackel_files : list of Path, optional
        MethylDackel CpG bedGraph files for methylation storage. May be
        combined with ``mc_files``/``hmc_files`` to build a mixed
        methylation store, but not with ``cxreport_files``.
    cxreport_files : list of Path, optional
        CXreport files for methylation storage. Cannot be combined with
        ``methyldackel_files`` or ``mc_files``/``hmc_files``.
    mc_files, hmc_files : list of Path, optional
        Split CXreport files (mC and hmC calls respectively) for
        methylation storage. Cannot be combined with ``cxreport_files``.
    vcf_files : list of Path, optional
        VCF.gz files (one per sample) for variant storage.
    chromsizes : str, Path, or dict, optional
        Chromosome sizes for the coverage store. Extracted from the first
        BAM file if not provided.
    metadata : DataFrame, Path, or str, optional
        Sample metadata CSV attached to all sub-stores. Each sub-store's
        sample names are used to subset the metadata automatically.
    bam_sample_names : list of str, optional
        Override sample names for BAM files (default: file stems).
    methyldackel_sample_names : list of str, optional
        Override sample names for bedGraph files (default: file stems).
        Useful when MethylDackel embeds genome/suffix in the filename,
        e.g. ``meth-rep1_hg38_CpG_inverted.bedGraph`` → ``"meth-rep1"``.
    cxreport_sample_names : list of str, optional
        Override sample names for ``cxreport_files``.
    mc_hmc_sample_names : list of str, optional
        Override sample names for the ``mc_files``/``hmc_files`` samples.
    vcf_sample_names : list of str, optional
        Override sample names for VCF files (default: filename before first dot).
    filter_chromosomes : bool, default True
        Keep only canonical chromosomes (``chr*`` without underscores).
    overwrite : bool, default True
        Overwrite existing sub-stores.
    resume : bool, default False
        Resume processing an existing sub-store, skipping completed samples.
    sample_column : str, default "sample_id"
        Column in ``metadata`` matching sample names.
    chunk_len : int, default 65536
        Zarr chunk size for the position dimension (coverage store only).
    construction_compression : {"default", "fast", "none"}, default "default"
        Build-time compression profile for the coverage store.
    local_staging : bool, default False
        Build the coverage store under local scratch storage before publishing.
    staging_dir : str or Path, optional
        Scratch directory for local staging. Defaults to system temp dir.
    log_file : Path, optional
        Path to write BAM processing logs.
    max_workers : int, default 1
        The number of threads used to process chromosomes in parallel.
        Samples are processed sequentially to keep memory usage low.
    test : bool, default False
        Restrict coverage to chr21/chr22/chrY (for testing).
    stranded : list of str or dict, optional
        Strand-specific coverage configuration.  Pass a **list** of sample names
        to use ``"U"`` (raw alignment orientation), or a **dict** mapping
        sample names to library types (``"R"``, ``"F"``, or ``"U"``).
        Use a dict for RNA-seq so read1/read2 orientation is taken into account.

        - ``"R"`` (ISR/dUTP/TruSeq Stranded): read1 aligns to reverse strand.
        - ``"F"`` (ISF/ligation): read1 aligns to forward strand.
        - ``"U"``: split purely by raw alignment orientation.

        Example::

            stranded={"rna-rep1": "R", "rna-rep2": "R"}

    Raises
    ------
    ValueError
        If no input files are provided at all, or if ``cxreport_files`` is
        combined with ``methyldackel_files`` or ``mc_files``/``hmc_files``.
    """
    # Validate that at least one modality has inputs.
    if not any([bam_files, vcf_files, methyldackel_files, cxreport_files, mc_files, hmc_files]):
        raise ValueError(
            "Provide at least one of bam_files, methyldackel_files, cxreport_files, "
            "mc_files/hmc_files, or vcf_files"
        )
    # cxreport_files cannot be mixed with bedgraph or split CXreport
    if cxreport_files and (methyldackel_files or mc_files or hmc_files):
        raise ValueError(
            "cxreport_files cannot be combined with methyldackel_files or mc_files/hmc_files"
        )

    store_dir = Path(store_dir)
    store_dir.mkdir(parents=True, exist_ok=True)

    # Coverage modality: each BAM becomes one sample in coverage.zarr.
    if bam_files:
        logger.info(f"Building coverage store from {len(bam_files)} BAM file(s)...")
        BamStore.from_bam_files(
            bam_files=[str(f) for f in bam_files],
            store_path=store_dir / "coverage.zarr",
            bam_sample_names=bam_sample_names,
            chromsizes=chromsizes,
            metadata=metadata,
            filter_chromosomes=filter_chromosomes,
            overwrite=overwrite,
            resume=resume,
            sample_column=sample_column,
            chunk_len=chunk_len,
            construction_compression=construction_compression,
            local_staging=local_staging,
            staging_dir=staging_dir,
            log_file=log_file,
            max_workers=max_workers,
            test=test,
            stranded=stranded,
        )

    # Route methylation: mixed (bedgraph + cx), bedgraph-only, cxreport, or split cx
    if methyldackel_files and (mc_files or hmc_files):
        n_bg = len(methyldackel_files)
        # mc/hmc file lists are expected to describe the same samples, so
        # either list's length gives the CXreport sample count.
        n_cx = len(mc_files or hmc_files)
        logger.info(
            f"Building mixed methylation store from {n_bg} bedGraph + {n_cx} CXreport sample(s)..."
        )
        MethylStore.from_mixed_files(
            methyldackel_files=[str(f) for f in methyldackel_files],
            mc_files=[str(f) for f in mc_files] if mc_files else None,
            hmc_files=[str(f) for f in hmc_files] if hmc_files else None,
            store_path=store_dir / "methylation.zarr",
            methyldackel_sample_names=methyldackel_sample_names,
            mc_hmc_sample_names=mc_hmc_sample_names,
            metadata=metadata,
            filter_chromosomes=filter_chromosomes,
            overwrite=overwrite,
            resume=resume,
            sample_column=sample_column,
        )
    elif methyldackel_files:
        logger.info(f"Building methylation store from {len(methyldackel_files)} bedGraph file(s)...")
        MethylStore.from_bedgraph_files(
            methyldackel_files=[str(f) for f in methyldackel_files],
            store_path=store_dir / "methylation.zarr",
            sample_names=methyldackel_sample_names,
            metadata=metadata,
            filter_chromosomes=filter_chromosomes,
            overwrite=overwrite,
            resume=resume,
            sample_column=sample_column,
        )
    elif cxreport_files:
        logger.info(f"Building methylation store from {len(cxreport_files)} CXreport file(s)...")
        MethylStore.from_cxreport_files(
            cxreport_files=[str(f) for f in cxreport_files],
            store_path=store_dir / "methylation.zarr",
            sample_names=cxreport_sample_names,
            metadata=metadata,
            filter_chromosomes=filter_chromosomes,
            overwrite=overwrite,
            resume=resume,
            sample_column=sample_column,
        )
    elif mc_files or hmc_files:
        n_cx = len(mc_files or hmc_files)
        logger.info(
            f"Building methylation store from {n_cx} split CXreport sample(s)..."
        )
        MethylStore.from_split_cxreport_files(
            mc_files=[str(f) for f in mc_files] if mc_files else None,
            hmc_files=[str(f) for f in hmc_files] if hmc_files else None,
            store_path=store_dir / "methylation.zarr",
            sample_names=mc_hmc_sample_names,
            metadata=metadata,
            filter_chromosomes=filter_chromosomes,
            overwrite=overwrite,
            resume=resume,
            sample_column=sample_column,
        )

    # Variant modality: one VCF per sample into variants.zarr.
    if vcf_files:
        logger.info(f"Building variants store from {len(vcf_files)} VCF file(s)...")
        VariantStore.from_vcf_files(
            vcf_files=[str(f) for f in vcf_files],
            store_path=store_dir / "variants.zarr",
            sample_names=vcf_sample_names,
            metadata=metadata,
            filter_chromosomes=filter_chromosomes,
            overwrite=overwrite,
            resume=resume,
            sample_column=sample_column,
        )

    # Re-open the directory so the returned instance reflects what was built.
    return cls(store_dir)

open classmethod

open(store_dir: str | Path) -> 'MultiomicsStore'

Open an existing MultiomicsStore directory.

Source code in quantnado/dataset/store_multiomics.py
@classmethod
def open(cls, store_dir: str | Path) -> "MultiomicsStore":
    """Open an existing MultiomicsStore directory.

    Raises ``FileNotFoundError`` when *store_dir* does not exist on disk.
    """
    root = Path(store_dir)
    if root.exists():
        return cls(root)
    raise FileNotFoundError(f"Store directory does not exist: {root}")

set_metadata

set_metadata(
    metadata: DataFrame, sample_column: str = "sample_id"
) -> None

Attach metadata to all sub-stores.

Each store is updated using the subset of metadata that matches its own sample names — samples not present in that store are ignored.

Source code in quantnado/dataset/store_multiomics.py
def set_metadata(
    self,
    metadata: pd.DataFrame,
    sample_column: str = "sample_id",
) -> None:
    """
    Attach metadata to all sub-stores.

    Every active sub-store is handed the full ``metadata`` frame; each one
    subsets it to its own sample names, so samples missing from a given
    store are ignored. Failures on an individual store are logged as
    warnings instead of propagating.
    """
    for sub_store in self._active_stores():
        try:
            sub_store.set_metadata(metadata, sample_column=sample_column)
        except Exception as exc:
            logger.warning(
                f"Could not set metadata on {type(sub_store).__name__}: {exc}"
            )

get_metadata

get_metadata() -> pd.DataFrame

Combined metadata across all modalities.

Returns a DataFrame indexed by sample_id with one row per unique sample. A modalities column lists which modalities each sample appears in.

Source code in quantnado/dataset/store_multiomics.py
def get_metadata(self) -> pd.DataFrame:
    """
    Combined metadata across all modalities.

    Returns a DataFrame indexed by ``sample_id`` with one row per unique
    sample. A ``modalities`` column lists which modalities each sample
    appears in.
    """
    # Collect one metadata frame per active modality, in fixed order.
    per_modality: dict[str, pd.DataFrame] = {}
    for name, sub in (
        ("coverage", self.coverage),
        ("methylation", self.methylation),
        ("variants", self.variants),
    ):
        if sub is None:
            continue
        frame = sub.get_metadata().copy()
        frame["modality"] = name
        per_modality[name] = frame

    if not per_modality:
        return pd.DataFrame()

    merged = pd.concat(per_modality.values(), ignore_index=False)

    # Per-sample modality labels; coverage samples with stranded arrays are
    # reported as "stranded_coverage" instead of plain "coverage".
    labels_by_sample: dict[str, list[str]] = {}
    for name, frame in per_modality.items():
        for sid in frame.index:
            stranded = (
                name == "coverage"
                and self.coverage is not None
                and bool(self.coverage._strandedness_map.get(sid))
            )
            labels_by_sample.setdefault(str(sid), []).append(
                "stranded_coverage" if stranded else name
            )

    # Deduplicate on sample_id (first occurrence wins) and drop the
    # per-frame helper column before attaching the combined labels.
    merged = merged[~merged.index.duplicated(keep="first")].drop(
        columns=["modality"], errors="ignore"
    )
    merged["modalities"] = merged.index.map(
        lambda sid: ", ".join(labels_by_sample.get(str(sid), []))
    )
    return merged