Skip to content

VariantStore

quantnado.dataset.store_variants.VariantStore

VariantStore(
    store_path: Path | str,
    sample: str,
    chromsizes: dict[str, int],
    *,
    chunk_len: int,
    compressors: list,
    overwrite: bool = True,
)

Per-sample Zarr store for VCF/SNP data.

Stores dense (1, chrom_len) arrays for GT, DP, AF, MQ. VCF positions (1-based) are mapped to 0-based array indices.

Use :meth:`from_vcf` to create a store or :meth:`open` to read an existing one.

Source code in quantnado/dataset/store_variants.py
def __init__(
    self,
    store_path: Path | str,
    sample: str,
    chromsizes: dict[str, int],
    *,
    chunk_len: int,
    compressors: list,
    overwrite: bool = True,
) -> None:
    """Set up a fresh Zarr v3 group on disk for a single sample.

    Parameters
    ----------
    store_path:
        Location of the store; passed through ``_normalize_path`` before use.
    sample:
        Sample name, recorded in the metadata group and the root attributes.
    chromsizes:
        Mapping of chromosome name to length; its sorted keys become
        ``self.chromosomes``.
    chunk_len:
        Chunk length, kept on the instance and written to the root attributes.
    compressors:
        Compressor list, kept on the instance for later use.
    overwrite:
        When true, ``_delete_path`` is called on the store path before the
        group is created.
    """
    # Plain configuration captured on the instance first.
    target = _normalize_path(store_path)
    self.store_path = target
    self.sample = sample
    self.chromsizes = chromsizes
    self.chromosomes = sorted(chromsizes)
    self.chunk_len = chunk_len
    self.compressors = compressors

    if overwrite:
        _delete_path(target)

    # The parent directory must exist before the local store is opened.
    target.parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the group is always created with overwrite=True, even
    # when the ``overwrite`` argument is False — confirm this is intended.
    self.root = zarr.group(
        store=LocalStore(str(target)),
        overwrite=True,
        zarr_format=3,
    )
    self.meta = create_metadata_group(self.root, sample)
    self._init_arrays()

    # Root-level attributes describing this store.
    root_attrs = {
        "assay": "snp",
        "sample": sample,
        "chromsizes": chromsizes,
        "chunk_len": chunk_len,
    }
    self.root.attrs.update(root_attrs)

from_vcf classmethod

from_vcf(
    vcf_path: str | Path,
    store_path: Path | str,
    sample: str,
    chromsizes: str | Path | dict[str, int] | None = None,
    *,
    chunk_len: int | None = None,
    construction_compression: str = DEFAULT_CONSTRUCTION_COMPRESSION,
    overwrite: bool = True,
    filter_chromosomes: bool = True,
    test: bool = False,
    test_chromosomes: list[str] | tuple[str, ...] | None = None,
    log_file: Path | None = None,
) -> "VariantStore"

Create a per-sample VariantStore zarr from a single-sample VCF.

Parameters:

Name Type Description Default
vcf_path str | Path

Path to annotated VCF (.vcf or .vcf.gz).

required
store_path Path | str

Output .zarr directory.

required
sample str

Sample name.

required
chromsizes str | Path | dict[str, int] | None

Path to .chrom.sizes, dict, or None to infer from VCF ##contig headers.

None
Source code in quantnado/dataset/store_variants.py
@classmethod
def from_vcf(
    cls,
    vcf_path: str | Path,
    store_path: Path | str,
    sample: str,
    chromsizes: str | Path | dict[str, int] | None = None,
    *,
    chunk_len: int | None = None,
    construction_compression: str = DEFAULT_CONSTRUCTION_COMPRESSION,
    overwrite: bool = True,
    filter_chromosomes: bool = True,
    test: bool = False,
    test_chromosomes: list[str] | tuple[str, ...] | None = None,
    log_file: Path | None = None,
) -> "VariantStore":
    """Build a per-sample VariantStore zarr from a single-sample VCF.

    Parameters
    ----------
    vcf_path:
        Path to annotated VCF (.vcf or .vcf.gz).
    store_path:
        Output .zarr directory.
    sample:
        Sample name.
    chromsizes:
        Path to .chrom.sizes, dict, or None to infer from VCF ##contig headers.
    chunk_len:
        Forwarded to ``_resolve_chunk_len`` together with the parsed
        chromsizes and the store path.
    construction_compression:
        Forwarded to ``_resolve_compressors`` to obtain the compressor list.
    overwrite:
        Forwarded unchanged to the constructor.
    filter_chromosomes, test, test_chromosomes:
        Forwarded unchanged to both ``_read_vcf`` and ``_parse_chromsizes``.
    log_file:
        When not None, ``setup_logging`` is called with this path and
        ``verbose=False`` before the VCF is read.

    Returns
    -------
    VariantStore
        The store after its variants have been written and it has been
        finalised with the total variant count.

    Raises
    ------
    ValueError
        If ``chromsizes`` is None and the VCF yielded no header chromsizes.
    """
    if log_file is not None:
        from quantnado.utils import setup_logging

        setup_logging(Path(log_file), verbose=False)

    # The same chromosome-selection options apply to the VCF reader and the
    # chromsizes parser, so collect them once.
    selection = {
        "filter_chromosomes": filter_chromosomes,
        "test": test,
        "test_chromosomes": test_chromosomes,
    }

    logger.info(f"Reading VCF: {vcf_path}")
    variants_by_chrom, contig_sizes = _read_vcf(vcf_path, **selection)

    if chromsizes is None and not contig_sizes:
        raise ValueError(
            "No chromsizes provided and VCF ##contig headers are missing. "
            "Provide chromsizes explicitly."
        )
    # Explicit chromsizes take precedence; the header is only a fallback.
    sizes_source = contig_sizes if chromsizes is None else chromsizes
    sizes = _parse_chromsizes(sizes_source, **selection)

    effective_chunk_len = _resolve_chunk_len(sizes, Path(store_path), chunk_len)
    codec_list = _resolve_compressors(construction_compression)

    store = cls(
        store_path=store_path,
        sample=sample,
        chromsizes=sizes,
        chunk_len=effective_chunk_len,
        compressors=codec_list,
        overwrite=overwrite,
    )
    store._write_variants(variants_by_chrom)

    # Total variant count is taken from the per-chromosome "pos" entries.
    total_variants = 0
    for per_chrom in variants_by_chrom.values():
        total_variants += len(per_chrom["pos"])
    store._finalise(total_variants)

    return store