Skip to content

Inputs API

This module handles the discovery and validation of input sequencing files.

seqnado.inputs.Metadata

Bases: BaseModel

Metadata for samples. Optional fields can be set to None.

set_mcc_defaults

set_mcc_defaults() -> Self

Set default consensus_group for MCC assay.

Source code in seqnado/inputs/core.py
@model_validator(mode='after')
def set_mcc_defaults(self) -> Self:
    """Fill in ``consensus_group`` for MCC samples that did not set one.

    Registered as an ``after`` model validator. When ``assay`` equals
    ``Assay.MCC`` and ``consensus_group`` is ``None``, the group is set to
    ``'default'``; in every other case the instance is left untouched.

    Returns:
        The same instance.
    """
    group_already_set = self.consensus_group is not None
    if group_already_set or self.assay != Assay.MCC:
        return self
    self.consensus_group = 'default'
    return self

seqnado.inputs.FastqCollection

Bases: BaseFastqCollection

Represents a collection of sequencing samples (FASTQ files) grouped into named sets, with optional per-sample metadata.

Attributes:

Name Type Description
fastq_sets list[FastqSet]

List of FastqSet objects (paired or single-end samples).

metadata list[Metadata]

List of Metadata objects corresponding one-to-one with fastq_sets.

sample_ids property

sample_ids: list[str]

Returns all sample IDs in the design.

sample_names property

sample_names: list[str]

Returns all sample names in the design.

fastq_paths property

fastq_paths: list[Path]

Flattens all R1/R2 file paths into a single list.

fastq_pairs property

fastq_pairs: dict[str, list[Path]]

Returns a dictionary mapping sample names to their FASTQ file paths.

validate_non_ip_assay classmethod

validate_non_ip_assay(v: Assay) -> Assay

Ensure the assay doesn't require IP (immunoprecipitation).

Source code in seqnado/inputs/fastq.py
@field_validator("assay")
@classmethod
def validate_non_ip_assay(cls, v: Assay) -> Assay:
    """Reject assays that require IP (immunoprecipitation) handling.

    Args:
        v: Candidate value for the ``assay`` field.

    Returns:
        ``v`` unchanged when it is not among ``Assay.ip_assays()``.

    Raises:
        ValueError: If ``v`` is among ``Assay.ip_assays()``.
    """
    requires_ip = v in Assay.ip_assays()
    if not requires_ip:
        return v
    message = f"Assay '{v.value}' requires IP and should use IPSampleCollection instead"
    raise ValueError(message)

query

query(sample_name: str) -> FastqSet

Retrieve the FastqSet whose sample_id matches sample_name.

Raises:

Type Description
ValueError

If sample_name not found.

Source code in seqnado/inputs/fastq.py
def query(self, sample_name: str) -> FastqSet:
    """
    Retrieve the FastqSet whose ``sample_id`` equals ``sample_name``.

    Sets are checked in ``fastq_sets`` order and the first match wins.

    Args:
        sample_name: Value compared against each set's ``sample_id``.

    Returns:
        The first matching FastqSet.

    Raises:
        ValueError: If no FastqSet has that ``sample_id``.
    """
    for fastq_set in self.fastq_sets:
        if fastq_set.sample_id == sample_name:
            return fastq_set
    # Raised outside any ``except`` block so the error does not carry an
    # unrelated StopIteration as implicit exception context.
    raise ValueError(f"Sample '{sample_name}' not found in SampleCollection")

is_paired_end

is_paired_end(uid: str) -> bool

Check if the given sample ID is paired-end.

Source code in seqnado/inputs/fastq.py
def is_paired_end(self, uid: str) -> bool:
    """
    Report whether the sample labelled ``uid`` has an R2 entry.

    The answer is read from the table returned by ``to_dataframe()``: the
    ``r2`` cell of the row labelled ``uid`` is compared against ``None``.

    Args:
        uid: Row label of the sample in the exported design table.

    Returns:
        ``True`` when the ``r2`` cell is not ``None``, otherwise ``False``.
    """
    design = self.to_dataframe()
    r2_entry = design.loc[uid, "r2"]
    return r2_entry is not None

from_fastq_files classmethod

from_fastq_files(
    assay: Assay,
    files: Iterable[str | Path],
    metadata: (
        Callable[[str], Metadata] | Metadata | None
    ) = None,
    **fastqset_kwargs: Any
) -> FastqCollection

Build a FastqCollection by scanning a list of FASTQ paths:

  1. Convert raw paths to FastqFile.
  2. Group by sample_base and sort by read_number.
  3. Create FastqSet (single- or paired-end) for each sample.
  4. Generate Metadata via metadata(sample_name), or default.

Parameters:

Name Type Description Default
files Iterable[str | Path]

Iterable of file paths (strings or Path).

required
metadata Callable[[str], Metadata] | Metadata | None
  • Callable(sample_name) → Metadata to customize per-sample metadata.
  • Single Metadata instance applied to all.
  • None → defaults to Metadata().
None
fastqset_kwargs Any

Extra fields forwarded to FastqSet constructor.

{}
Source code in seqnado/inputs/fastq.py
@classmethod
def from_fastq_files(
    cls,
    assay: Assay,
    files: Iterable[str | Path],
    metadata: Callable[[str], Metadata] | Metadata | None = None,
    **fastqset_kwargs: Any,
) -> FastqCollection:
    """
    Assemble a collection from an iterable of FASTQ file paths.

    Each path is wrapped in a FastqFile; the files are ordered by
    ``(sample_base, read_number)`` and bucketed by ``sample_base``. Every
    bucket becomes one FastqSet: a single file is used as ``r1``, two files
    are used as ``r1`` and ``r2`` in sorted order.

    Args:
        assay: Assay handed to ``cls._build_metadata`` and to the
            collection constructor.
        files: Iterable of file paths (strings or Path).
        metadata: Passed unchanged to ``cls._build_metadata`` for every
            sample; may be a callable taking the sample name, a single
            Metadata instance, or None.
        fastqset_kwargs: Extra fields forwarded to each FastqSet
            constructor call.

    Returns:
        The result of ``cls(...)`` holding one FastqSet and one Metadata
        entry per sample.

    Raises:
        ValueError: If a sample has a number of FASTQ files other than one
            or two.
    """
    ordered = sorted(
        (FastqFile(path=Path(raw)) for raw in files),
        key=lambda fq: (fq.sample_base, fq.read_number),
    )

    # Bucket per sample; insertion order follows the sort above.
    by_sample: dict[str, list[FastqFile]] = {}
    for fastq in ordered:
        by_sample.setdefault(fastq.sample_base, []).append(fastq)

    collected_sets: list[FastqSet] = []
    collected_metadata: list[Metadata] = []
    for sample, reads in by_sample.items():
        n_reads = len(reads)
        if n_reads not in (1, 2):
            raise ValueError(
                f"Unexpected number of FASTQ files for '{sample}': {len(reads)}"
            )

        # r2 is only passed for the two-file case, so a single-file sample
        # relies on whatever FastqSet does when r2 is omitted.
        mates = {"r1": reads[0]}
        if n_reads == 2:
            mates["r2"] = reads[1]
        collected_sets.append(FastqSet(sample_id=sample, **mates, **fastqset_kwargs))

        collected_metadata.append(cls._build_metadata(sample, metadata, assay))

    return cls(assay=assay, fastq_sets=collected_sets, metadata=collected_metadata)

from_directory classmethod

from_directory(
    assay: Assay,
    directory: str | Path,
    glob_patterns: Iterable[str] = (
        "*.fq",
        "*.fq.gz",
        "*.fastq",
        "*.fastq.gz",
    ),
    metadata: (
        Callable[[str], Metadata] | Metadata | None
    ) = None,
    **kwargs: Any
) -> FastqCollection

Recursively scan a directory for FASTQ files and build a FastqCollection.

Parameters:

Name Type Description Default
directory str | Path

Root path to search.

required
glob_patterns Iterable[str]

Filename patterns to include.

('*.fq', '*.fq.gz', '*.fastq', '*.fastq.gz')
metadata Callable[[str], Metadata] | Metadata | None

Callable(sample_name) → Metadata or single Metadata instance.

None
**kwargs Any

Extra fields converted directly to a shared Metadata.

{}
Source code in seqnado/inputs/fastq.py
@classmethod
def from_directory(
    cls,
    assay: Assay,
    directory: str | Path,
    glob_patterns: Iterable[str] = ("*.fq", "*.fq.gz", "*.fastq", "*.fastq.gz"),
    metadata: Callable[[str], Metadata] | Metadata | None = None,
    **kwargs: Any,
) -> FastqCollection:
    """
    Build a collection from the FASTQ files found under a directory.

    File discovery is delegated to ``cls._discover_files`` and metadata
    preparation to ``cls._prepare_metadata_for_directory``; the results are
    then handed to ``cls.from_fastq_files``.

    Args:
        assay: Assay forwarded to ``from_fastq_files``.
        directory: Root path to search.
        glob_patterns: Filename patterns to include.
        metadata: Callable(sample_name) → Metadata, a single Metadata
            instance, or None.
        **kwargs: Extra fields passed to
            ``cls._prepare_metadata_for_directory`` together with
            ``metadata``.

    Returns:
        Whatever ``cls.from_fastq_files`` returns for the discovered files.
    """
    # Keyword arguments are evaluated left to right, so discovery still runs
    # before metadata preparation.
    return cls.from_fastq_files(
        assay=assay,
        files=cls._discover_files(directory, glob_patterns),
        metadata=cls._prepare_metadata_for_directory(metadata, **kwargs),
    )

to_dataframe

to_dataframe(validate: bool = True) -> pd.DataFrame

Export the design to a pandas DataFrame, validated against DesignDataFrame when validate is True.

Columns: sample_id, r1, r2, plus all non-null metadata fields; the frame is indexed by uid.

Source code in seqnado/inputs/fastq.py
def to_dataframe(self, validate: bool = True) -> pd.DataFrame:
    """
    Export the design as a pandas DataFrame indexed by ``uid``.

    One row is produced per FastqSet with ``sample_id``, ``r1`` and ``r2``
    (``None`` when the set has no second read); ``uid`` is the formatted
    ``sample_id``. When the collection has metadata, each row also carries
    that sample's non-``None`` metadata fields. Rows are sorted by
    ``sample_id``; columns are ordered ``assay``, ``sample_id``, ``r1``,
    ``r2`` (those present), followed by the remaining columns sorted by
    name.

    Args:
        validate: When True the frame is wrapped as
            ``DataFrame[DesignDataFrame]``; when False it is returned as is.

    Returns:
        The design table, validated or plain depending on ``validate``.
    """
    import pandas as pd

    def _file_columns(fastq_set) -> dict[str, Any]:
        # Columns every row carries, with or without metadata.
        second_read = fastq_set.r2
        return {
            "sample_id": fastq_set.sample_id,
            "r1": fastq_set.r1.path,
            "r2": second_read.path if second_read else None,
            "uid": f"{fastq_set.sample_id}",
        }

    records: list[dict[str, Any]] = []
    if not self.metadata:
        records.extend(_file_columns(fastq_set) for fastq_set in self.fastq_sets)
    else:
        for fastq_set, sample_md in zip(self.fastq_sets, self.metadata):
            record = _file_columns(fastq_set)
            extras = sample_md.model_dump(exclude_none=True)
            # Enum-like assay entries are replaced by their plain value for
            # schema validation.
            assay_entry = extras.get("assay")
            if hasattr(assay_entry, "value"):
                extras["assay"] = assay_entry.value
            record.update(extras)
            records.append(record)

    if records:
        frame = pd.DataFrame(records).sort_values("sample_id").set_index("uid")
    else:
        # No samples: still expose the expected columns.
        frame = pd.DataFrame(columns=["sample_id", "r1", "r2", "uid"]).set_index("uid")

    # Critical columns lead (assay, sample info, files); the rest follow
    # in alphabetical order.
    leading = ("assay", "sample_id", "r1", "r2")
    present_leading = [name for name in leading if name in frame.columns]
    trailing = sorted(name for name in frame.columns if name not in leading)
    frame = frame[present_leading + trailing]

    if not validate:
        return frame
    return DataFrame[DesignDataFrame](frame)

from_dataframe classmethod

from_dataframe(
    assay: Assay,
    df: DataFrame,
    validate_deseq2: bool = False,
    assay_for_validation: Assay | None = None,
    **fastqset_kwargs: Any
) -> FastqCollection

Build a FastqCollection from a DataFrame, validated by DesignDataFrame.

Expects columns: sample_id, r1, r2, plus any metadata fields.

Parameters:

Name Type Description Default
assay Assay

The assay type

required
df DataFrame

DataFrame with sample metadata

required
validate_deseq2 bool

If True, require deseq2 field to be non-null (for RNA assays)

False
assay_for_validation Assay | None

Assay type to check in validation context

None
**fastqset_kwargs Any

Additional kwargs for FastqSet

{}
Source code in seqnado/inputs/fastq.py
@classmethod
def from_dataframe(
    cls,
    assay: Assay,
    df: pd.DataFrame,
    validate_deseq2: bool = False,
    assay_for_validation: Assay | None = None,
    **fastqset_kwargs: Any,
) -> FastqCollection:
    """
    Build a collection from a design DataFrame.

    The frame is first passed through ``DesignDataFrame.validate``. Each
    row then yields one FastqSet (from ``sample_id``, ``r1`` and, when not
    missing, ``r2``) and one Metadata built from the row's columns whose
    names are Metadata fields.

    Args:
        assay: The assay type given to the collection constructor.
        df: DataFrame with ``sample_id``, ``r1``, optionally ``r2``, plus
            any metadata columns.
        validate_deseq2: Placed in the Metadata validation context under
            ``'validate_deseq2'``.
        assay_for_validation: Assay placed in the Metadata validation
            context under ``'assay'``; falls back to ``assay`` when falsy.
        **fastqset_kwargs: Additional kwargs forwarded to every FastqSet.

    Returns:
        The result of ``cls(...)`` with one FastqSet and one Metadata per
        row.
    """
    validated = DesignDataFrame.validate(df)
    known_fields = set(Metadata.model_fields.keys())

    # An explicit assay_for_validation wins; otherwise validate against `assay`.
    context_assay = assay_for_validation or assay

    collected_sets: list[FastqSet] = []
    collected_metadata: list[Metadata] = []
    for record in validated.to_dict(orient="records"):
        raw_r2 = record.get("r2")
        sample_id = record["sample_id"]
        first_read = FastqFile(path=record["r1"])
        # A missing r2 cell means the row has no second read.
        if pd.notna(raw_r2):
            second_read = FastqFile(path=raw_r2)
        else:
            second_read = None
        collected_sets.append(
            FastqSet(
                sample_id=sample_id,
                r1=first_read,
                r2=second_read,
                **fastqset_kwargs,
            )
        )

        # Only columns that are Metadata fields are passed on; a fresh
        # context dict is built for every row.
        row_fields = {name: record.get(name) for name in known_fields if name in record}
        row_context = {'validate_deseq2': validate_deseq2, 'assay': context_assay}
        collected_metadata.append(Metadata.model_validate(row_fields, context=row_context))

    return cls(assay=assay, fastq_sets=collected_sets, metadata=collected_metadata)

seqnado.inputs.BamCollection

Bases: BaseCollection

Collection of BAM files with optional per-sample metadata.

Provides convenience constructors analogous to FastqCollection but without paired-end logic.

from_dataframe classmethod

from_dataframe(
    assay: Assay,
    df: Any,
    validate_deseq2: bool = False,
    assay_for_validation: Assay | None = None,
    **kwargs: Any
) -> BamCollection

Build a BamCollection from a DataFrame.

Expects columns: sample_id, bam, plus any metadata fields.

Parameters:

Name Type Description Default
assay Assay

The assay type

required
df Any

DataFrame with sample metadata

required
validate_deseq2 bool

If True, require deseq2 field to be non-null (for RNA assays)

False
assay_for_validation Assay | None

Assay type to check in validation context

None
**kwargs Any

Additional kwargs

{}
Source code in seqnado/inputs/bam.py
@classmethod
def from_dataframe(
    cls,
    assay: Assay,
    df: Any,
    validate_deseq2: bool = False,
    assay_for_validation: Assay | None = None,
    **kwargs: Any,
) -> BamCollection:
    """Build a BamCollection from a DataFrame.

    Each row yields one BamFile built from its ``bam`` column and one
    Metadata built from the row's columns whose names are Metadata fields.

    Args:
        assay: The assay type given to the collection constructor.
        df: DataFrame with a ``bam`` column plus any metadata columns;
            only ``df.to_dict(orient="records")`` is called on it.
        validate_deseq2: Placed in the Metadata validation context under
            ``'validate_deseq2'``.
        assay_for_validation: Assay placed in the Metadata validation
            context under ``'assay'``; falls back to ``assay`` when falsy.
        **kwargs: Accepted for signature compatibility; not used by this
            method.

    Returns:
        The result of ``cls(...)`` with one BamFile and one Metadata per
        row.
    """
    # NOTE: the former function-local ``import pandas as pd`` was removed
    # because nothing in this body referenced ``pd``.
    bam_files: list[BamFile] = []
    metadata: list[Metadata] = []
    metadata_fields = set(Metadata.model_fields.keys())

    # Use provided assay_for_validation or fall back to assay
    validation_assay = assay_for_validation or assay

    for rec in df.to_dict(orient="records"):
        bam_files.append(BamFile(path=Path(rec["bam"])))

        # Only columns that are Metadata fields are passed on; a fresh
        # context dict is built for every row.
        meta_fields = {k: rec.get(k) for k in metadata_fields if k in rec}
        context = {'validate_deseq2': validate_deseq2, 'assay': validation_assay}
        metadata.append(Metadata.model_validate(meta_fields, context=context))

    return cls(assay=assay, bam_files=bam_files, metadata=metadata)

← Back to API Overview