diff --git a/README.md b/README.md
index 0b70a39d098..f2bff404303 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@
 🤗 Datasets is a lightweight library providing **two** main features:
 
 - **one-line dataloaders for many public datasets**: one-liners to download and pre-process any of the ![number of datasets](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/datasets&color=brightgreen) major public datasets (image datasets, audio datasets, text datasets in 467 languages and dialects, etc.) provided on the [HuggingFace Datasets Hub](https://huggingface.co/datasets). With a simple command like `squad_dataset = load_dataset("rajpurkar/squad")`, get any of these datasets ready to use in a dataloader for training/evaluating a ML model (Numpy/Pandas/PyTorch/TensorFlow/JAX),
-- **efficient data pre-processing**: simple, fast and reproducible data pre-processing for the public datasets as well as your own local datasets in CSV, JSON, text, PNG, JPEG, WAV, MP3, Parquet, HDF5, etc. With simple commands like `processed_dataset = dataset.map(process_example)`, efficiently prepare the dataset for inspection and ML model evaluation and training.
+- **efficient data pre-processing**: simple, fast and reproducible data pre-processing for the public datasets as well as your own local datasets in CSV, JSON, text, PNG, JPEG, WAV, MP3, Parquet, HDF5, FASTA, etc. With simple commands like `processed_dataset = dataset.map(process_example)`, efficiently prepare the dataset for inspection and ML model evaluation and training.
 
 [🎓 **Documentation**](https://huggingface.co/docs/datasets/) [🔎 **Find a dataset in the Hub**](https://huggingface.co/datasets) [🌟 **Share a dataset on the Hub**](https://huggingface.co/docs/datasets/share)
 
diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx
index 74e3a8e383d..c7a14a71f4c 100644
--- a/docs/source/loading.mdx
+++ b/docs/source/loading.mdx
@@ -180,7 +180,25 @@
 For now only the Arrow streaming format is supported. The Arrow IPC file format (also known as Feather V2) is not supported.
 
 Note that the HDF5 loader assumes that the file has "tabular" structure, i.e. that all datasets in the file have (the same number of) rows on their first dimension.
 
+### FASTA
+
+[FASTA](https://www.ncbi.nlm.nih.gov/genbank/fastaformat/) files are commonly used for storing genomic, protein, and nucleotide sequence data.
+
+```py
+>>> from datasets import load_dataset
+>>> dataset = load_dataset("fasta", data_files="data.fasta")
+```
+
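+Each record becomes a row with `id`, `description`, and `sequence` string columns. For example, a record `>seq1 first sequence` followed by the line `ATGC` is loaded as:
+
+```py
+>>> dataset["train"][0]
+{'id': 'seq1', 'description': 'seq1 first sequence', 'sequence': 'ATGC'}
+```
+
+FASTQ files can be loaded with the same builder by passing `file_type="fastq"`, which adds a `quality` column holding the Phred-encoded (phred+33) quality string of each read.
+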
"faiss-cpu>=1.8.0.post1", # Pins numpy < 2 "h5py", + "biopython", "jax>=0.3.14; sys_platform != 'win32'", "jaxlib>=0.3.14; sys_platform != 'win32'", "lz4; python_version < '3.14'", # python 3.14 gives ImportError: cannot import name '_compression' from partially initialized module 'lz4.frame diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index 9d076df44b7..57b32af8377 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -8,6 +8,7 @@ from .audiofolder import audiofolder from .cache import cache from .csv import csv +from .fasta import fasta from .hdf5 import hdf5 from .imagefolder import imagefolder from .json import json @@ -51,6 +52,7 @@ def _hash_python_lines(lines: list[str]) -> str: "webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())), "xml": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())), "hdf5": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())), + "fasta": (fasta.__name__, _hash_python_lines(inspect.getsource(fasta).splitlines())), } # get importable module names and hash for caching @@ -82,6 +84,11 @@ def _hash_python_lines(lines: list[str]) -> str: ".xml": ("xml", {}), ".hdf5": ("hdf5", {}), ".h5": ("hdf5", {}), + ".fa": ("fasta", {}), + ".fasta": ("fasta", {}), + ".fna": ("fasta", {}), + ".ffn": ("fasta", {}), + ".frn": ("fasta", {}), } _EXTENSION_TO_MODULE.update({ext: ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext.upper(): ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) diff --git a/src/datasets/packaged_modules/fasta/__init__.py b/src/datasets/packaged_modules/fasta/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/datasets/packaged_modules/fasta/fasta.py b/src/datasets/packaged_modules/fasta/fasta.py new file mode 100644 index 00000000000..e3797db5062 --- /dev/null +++ b/src/datasets/packaged_modules/fasta/fasta.py @@ -0,0 +1,136 @@ +import itertools +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, Iterable, Optional +from typing import List as ListT + +import pyarrow as pa +from Bio import SeqIO + +import datasets +from datasets.features import Value +from datasets.table import table_cast + + +logger = datasets.utils.logging.get_logger(__name__) + +if TYPE_CHECKING: + from Bio import SeqIO + +# Common FASTA extensions; .gz will be handled by dl_manager.extract_on_the_fly +EXTENSIONS = [".fa", ".fasta", ".fna", ".ffn", ".faa", ".frn", ".fa.gz", ".fasta.gz"] + + +@dataclass +class FASTAConfig(datasets.BuilderConfig): + """BuilderConfig for FASTA.""" + + batch_size: Optional[int] = None + columns: Optional[ListT[str]] = None # subset of ["id", "description", "sequence"] + features: Optional[datasets.Features] = None + + def __post_init__(self): + super().__post_init__() + + +class FASTA(datasets.GeneratorBasedBuilder): + """GeneratorBasedBuilder that converts FASTA files to Arrow tables.""" + + BUILDER_CONFIG_CLASS = FASTAConfig + + def _info(self): + if ( + self.config.columns is not None + and self.config.features is not None + and set(self.config.columns) != set(self.config.features) + ): + raise ValueError( + "The columns and features argument must contain the same columns, but got " + f"{self.config.columns} and {self.config.features}", + ) + # Default features if not provided + if self.config.features is None: + self.config.features = 
+
+
+class FASTA(datasets.GeneratorBasedBuilder):
+    """GeneratorBasedBuilder that yields one example per FASTA/FASTQ record."""
+
+    BUILDER_CONFIG_CLASS = FASTAConfig
+
+    def _info(self):
+        if (
+            self.config.columns is not None
+            and self.config.features is not None
+            and set(self.config.columns) != set(self.config.features)
+        ):
+            raise ValueError(
+                "The columns and features argument must contain the same columns, but got "
+                f"{self.config.columns} and {self.config.features}",
+            )
+        # Default features if not provided; FASTQ records additionally carry a quality string
+        if self.config.features is None:
+            column_names = ["id", "description", "sequence"]
+            if self.config.file_type == "fastq":
+                column_names.append("quality")
+            self.config.features = datasets.Features({name: Value("string") for name in column_names})
+        return datasets.DatasetInfo(features=self.config.features)
+
+    def _split_generators(self, dl_manager):
+        if not self.config.data_files:
+            raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
+        dl_manager.download_config.extract_on_the_fly = True
+        data_files = dl_manager.download_and_extract(self.config.data_files)
+
+        splits = []
+        for split_name, files in data_files.items():
+            if isinstance(files, str):
+                files = [files]
+            # Expand dirs/globs into concrete file iterables
+            files = [dl_manager.iter_files(file) for file in files]
+            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
+
+        # Optionally narrow the features to the requested columns
+        if self.config.columns is not None and set(self.config.columns) != set(self.info.features):
+            self.info.features = datasets.Features(
+                {col: feat for col, feat in self.info.features.items() if col in self.config.columns}
+            )
+
+        return splits
+
+    def _generate_examples(self, files):
+        # files is an iterable of iterables (one per user provided path)
+        columns = list(self.info.features)
+        for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
+            try:
+                for row_idx, record in enumerate(_iter_records(file, self.config.file_type)):
+                    yield f"{file_idx}_{row_idx}", {col: record[col] for col in columns}
+            except ValueError as e:
+                logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
+                raise
+
+
+# ┌─────────────────────┐
+# │ FASTA/FASTQ I/O     │
+# └─────────────────────┘
+
+
+def _iter_records(path: str, file_type: str) -> Iterable[Dict[str, str]]:
+    """
+    Streaming parser that yields one dict per record.
+
+    FASTA records have the keys: id, description, sequence.
+    FASTQ records additionally have a quality key with the Sanger-encoded (phred+33) quality string.
+    xopen handles regular files as well as compressed files and fsspec streaming paths
+    (e.g. gzip://file::path.gz); the handle is opened in text mode for Biopython's SeqIO.parse.
+    """
+    with xopen(path, "rt", encoding="utf-8") as f:
+        for r in SeqIO.parse(f, file_type):
+            record = {"id": r.id, "description": r.description, "sequence": str(r.seq)}
+            if file_type == "fastq":
+                # Biopython parses FASTQ qualities into Phred scores; re-encode them as the original ASCII string
+                record["quality"] = "".join(chr(q + 33) for q in r.letter_annotations["phred_quality"])
+            yield record
diff --git a/tests/packaged_modules/test_fasta.py b/tests/packaged_modules/test_fasta.py
new file mode 100644
index 00000000000..38612f4f206
--- /dev/null
+++ b/tests/packaged_modules/test_fasta.py
@@ -0,0 +1,556 @@
+import gzip
+from textwrap import dedent
+
+import pytest
+
+from datasets import Features, Value
+from datasets.builder import InvalidConfigName
+from datasets.data_files import DataFilesDict, DataFilesList
+from datasets.download.streaming_download_manager import StreamingDownloadManager
+from datasets.packaged_modules.fasta.fasta import FASTA, FASTAConfig
+
+
+# ┌─────────────────────────┐
+# │ Fixtures: FASTA files   │
+# └─────────────────────────┘
+
+
+@pytest.fixture
+def fasta_basic(tmp_path):
+    p = tmp_path / "basic.fasta"
+    # The header goes on the same line as '>'
+    p.write_text(">seq1 description here\nATGCATGC\nATGC\n>seq2 another desc\nGGGTTT\n>seq3\nAAAA\nTTTT\nCCCC\nGGGG\n")
+    return str(p)
+
+
+@pytest.fixture
+def fasta_with_whitespace(tmp_path):
+    p = tmp_path / "whitespace.fasta"
+    # Headers on the same line; sequences contain spaces/blank lines intentionally
+    p.write_text(">id1 some desc\nATG C A T GC\n\n>id2 desc with spaces\nG G G T T T \n>id3\nA T G C\n")
+    return str(p)
+
+
+@pytest.fixture
+def fasta_empty(tmp_path):
+    p = tmp_path / "empty.fasta"
+    p.write_text("")  # no records
+    return str(p)
+
+
+@pytest.fixture
+def fasta_multi(tmp_path):
+    p1 = tmp_path / "file1.fasta"
+    p2 = tmp_path / "file2.fasta"
+    p1.write_text(">a\nAAA\n>b\nBBBB\n")
+    p2.write_text(">c\nC\n>d desc\nDDD\n")
+    return str(p1), str(p2)
+
+
+@pytest.fixture
+def fasta_gz(tmp_path):
+    p = tmp_path / "gz.fasta.gz"
+    content = ">gz1 first\nATATAT\n>gz2\nGCGC\n"
+    with gzip.open(p, "wb") as f:
+        f.write(content.encode("utf-8"))
+    return str(p)
+
+
+# ┌─────────────────────────┐
+# │ Fixtures: FASTQ files   │
+# └─────────────────────────┘
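+# FASTQ quality strings are ASCII-encoded Phred scores (chr(score + 33)); e.g. '!' is Q0 and 'I' is Q40.
+# The builder re-encodes the scores parsed by Biopython back into this form, so the fixtures below round-trip exactly.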
+
+
+@pytest.fixture
+def fastq_basic(tmp_path):
+    p = tmp_path / "basic.fastq"
+    content = dedent("""\
+        @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
+        GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC
+        +SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
+        IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC
+        @SRR001666.2 071112_SLXA-EAS1_s_7:5:1:801:338 length=36
+        GTTCAGGGATACGACGTTTGTATTTTAAGAATCTGA
+        +SRR001666.2 071112_SLXA-EAS1_s_7:5:1:801:338 length=36
+        IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII6IBI
+        """)
+    p.write_text(content)
+    return str(p)
+
+
+@pytest.fixture
+def fastq_multiline(tmp_path):
+    p = tmp_path / "multiline.fastq"
+    # FASTQ with multi-line sequences and quality scores
+    content = dedent("""\
+        @read1
+        GATTTGGGGTTCAAAGCAGTATCGATCAAATAGT
+        AAATCCATTTGTTCAACTCACAGTTT
+        +
+        !''*((((***+))%%%++)(%%%%).1***-+*'
+        '))**55CCF>>>>>>CCCCCCC65
+        @read2
+        ACGT
+        ACGT
+        +
+        IIII
+        IIII
+        """)
+    p.write_text(content)
+    return str(p)
+
+
+@pytest.fixture
+def fastq_empty(tmp_path):
+    p = tmp_path / "empty.fastq"
+    p.write_text("")
+    return str(p)
+
+
+@pytest.fixture
+def fastq_multi(tmp_path):
+    p1 = tmp_path / "file1.fastq"
+    p2 = tmp_path / "file2.fastq"
+    p1.write_text("@read1\nATGC\n+\nIIII\n@read2\nGGGG\n+\n!!!!\n")
+    p2.write_text("@read3\nAAAA\n+\nHHHH\n@read4\nTTTT\n+\n####\n")
+    return str(p1), str(p2)
+
+
+@pytest.fixture
+def fastq_gz(tmp_path):
+    p = tmp_path / "compressed.fastq.gz"
+    content = "@gz_read1\nATGCATGC\n+\nIIIIIIII\n@gz_read2\nGGGGTTTT\n+\nHHHHHHHH\n"
+    with gzip.open(p, "wb") as f:
+        f.write(content.encode("utf-8"))
+    return str(p)
+
+
+# ┌──────────────────────┐
+# │ Config validation    │
+# └──────────────────────┘
+
+
+def test_config_raises_when_invalid_name():
+    with pytest.raises(InvalidConfigName, match="Bad characters"):
+        _ = FASTAConfig(name="bad*name")
+
+
+@pytest.mark.parametrize("data_files", ["str_path", ["str_path"], DataFilesList(["str_path"], [()])])
+def test_config_raises_when_invalid_data_files(data_files):
+    with pytest.raises(ValueError, match="Expected a DataFilesDict"):
+        _ = FASTAConfig(name="ok", data_files=data_files)
+
+
+# ┌──────────────────────────────┐
+# │ Basic functionality & schema │
+# └──────────────────────────────┘
+
+
+def test_fasta_basic_functionality(fasta_basic):
+    fasta = FASTA()
+    generator = fasta._generate_examples([[fasta_basic]])
+    examples = list(generator)
+    assert len(examples) == 3
+
+    # Collect all rows
+    all_rows = [example for _, example in examples]
+
+    # Check columns
+    assert {"id", "description", "sequence"} <= set(all_rows[0].keys())
+
+    # Order should match stream order: seq1, seq2, seq3
+    assert all_rows[0]["id"] == "seq1"
+    assert all_rows[0]["description"] == "seq1 description here"
+    assert all_rows[0]["sequence"] == "ATGCATGCATGC"  # concatenated
+
+    assert all_rows[1]["id"] == "seq2"
+    assert all_rows[1]["description"] == "seq2 another desc"
+    assert all_rows[1]["sequence"] == "GGGTTT"
+
+    assert all_rows[2]["id"] == "seq3"
+    assert all_rows[2]["description"] == "seq3"
+    assert all_rows[2]["sequence"] == "AAAATTTTCCCCGGGG"
+
+
+def test_fasta_whitespace_and_multiline(fasta_with_whitespace):
+    fasta = FASTA()
+    generator = fasta._generate_examples([[fasta_with_whitespace]])
+    examples = list(generator)
+
+    # Collect all rows
+    rows = [example for _, example in examples]
+
+    assert rows[0]["id"] == "id1"
+    assert rows[0]["sequence"] == "ATGCATGC"  # spaces & blank lines stripped
+
+    assert rows[1]["id"] == "id2"
+    assert rows[1]["description"].startswith("id2")
+    assert rows[1]["sequence"] == "GGGTTT"
+
+    assert rows[2]["id"] == "id3"
+    assert rows[2]["sequence"] == "ATGC"
+
+
+# ┌───────────────┐
+# │ Batching      │
+# └───────────────┘
+
+
+def test_fasta_batch_processing(fasta_basic):
+    config = FASTAConfig(batch_size=2)
+    fasta = FASTA()
+    fasta.config = config
+
+    generator = fasta._generate_examples([[fasta_basic]])
+    examples = list(generator)
+
+    # 3 records in the file (batch_size doesn't affect _generate_examples)
+    assert len(examples) == 3
+
+
+# ┌───────────────────┐
+# │ Column filtering  │
+# └───────────────────┘
+
+
+def test_fasta_column_filtering(fasta_basic):
+    config = FASTAConfig(columns=["id", "sequence"])
+
+    fasta = FASTA()
+    fasta.config = config
+    # _info() builds the default features; narrow them to the requested columns
+    info = fasta._info()
+    fasta.info.features = Features({col: feat for col, feat in info.features.items() if col in config.columns})
+
+    generator = fasta._generate_examples([[fasta_basic]])
+    examples = list(generator)
+
+    # Ensure only selected columns appear
+    for _, example in examples:
+        assert set(example.keys()) == {"id", "sequence"}
+        assert isinstance(example["id"], str)
+        assert isinstance(example["sequence"], str)
+
+
+def 
test_fasta_columns_features_mismatch():
+    features = Features({"id": Value("string"), "sequence": Value("string")})
+    config = FASTAConfig(
+        name="t",
+        columns=["id", "description"],  # mismatch vs features
+        features=features,
+    )
+    fasta = FASTA()
+    fasta.config = config
+    with pytest.raises(ValueError, match="must contain the same columns"):
+        fasta._info()
+
+
+# ┌───────────────────────┐
+# │ Features & casting    │
+# └───────────────────────┘
+
+
+def test_fasta_default_features():
+    fasta = FASTA()
+    info = fasta._info()
+    assert set(info.features.keys()) == {"id", "description", "sequence"}
+
+
+def test_fasta_feature_specification(fasta_basic):
+    features = Features({"id": Value("string"), "description": Value("string"), "sequence": Value("string")})
+    config = FASTAConfig(features=features)
+    fasta = FASTA()
+    fasta.config = config
+
+    examples = list(fasta._generate_examples([[fasta_basic]]))
+    # Examples expose exactly the specified feature columns as strings
+    _, example = examples[0]
+    for col in features:
+        assert col in example
+        assert isinstance(example[col], str)
+
+
+# ┌───────────────────────────────┐
+# │ Empty files                   │
+# └───────────────────────────────┘
+
+
+def test_fasta_empty_file(fasta_empty):
+    fasta = FASTA()
+    examples = list(fasta._generate_examples([[fasta_empty]]))
+    # An empty file simply produces no examples
+    assert len(examples) == 0
+
+
+# ┌───────────────────────────────┐
+# │ Multiple files & splits       │
+# └───────────────────────────────┘
+
+
+def test_fasta_multiple_files(fasta_multi):
+    f1, f2 = fasta_multi
+    fasta = FASTA()
+    examples = list(fasta._generate_examples([[f1, f2]]))
+    # Expect records from both files in order
+    ids = [example["id"] for _, example in examples]
+    assert len(examples) == 4
+    assert ids == ["a", "b", "c", "d"]
+
+
+def test_fasta_gz_via_dl_manager(fasta_gz):
+    # Gzipped FASTA files resolved through the StreamingDownloadManager must be readable,
+    # i.e. the builder has to handle compressed files transparently
+    data_files = DataFilesDict({"train": [fasta_gz]})
+    config = FASTAConfig(data_files=data_files)
+    fasta = FASTA()
+    fasta.config = config
+
+    dlm = StreamingDownloadManager()
+    splits = fasta._split_generators(dlm)
+    assert len(splits) == 1
+    # Generate examples using files from dl_manager (ensures .gz is extracted on the fly)
+    examples = list(fasta._generate_examples(splits[0].gen_kwargs["files"]))
+
+    # Collect all examples
+    rows = [example for _, example in examples]
+
+    assert len(rows) == 2
+    assert rows[0]["id"] == "gz1"
+    assert rows[0]["sequence"] == "ATATAT"
+    assert rows[1]["id"] == "gz2"
+    assert rows[1]["sequence"] == "GCGC"
+
+
+# ┌───────────────────────────────┐
+# │ Integration: load_dataset     │
+# └───────────────────────────────┘
+
+
+def test_fasta_load_dataset_like_usage(fasta_basic):
+    # Exercise the builder the same way load_dataset consumes the packaged module,
+    # but call it directly to avoid download/caching complexity.
+    config = FASTAConfig()
+    fasta = FASTA()
+    fasta.config = config
+    examples = list(fasta._generate_examples([[fasta_basic]]))
+    assert len(examples) >= 1
+
+    # Verify that examples have the expected structure
+    _, example = examples[0]
+    assert "id" in example
+    assert "description" in example
+    assert "sequence" in example
+
+
+# ┌───────────────────────────────┐
+# │ Edge cases                    │
+# └───────────────────────────────┘
+
+
+def test_fasta_handles_no_trailing_newline(tmp_path):
+    p = tmp_path / "no_newline.fasta"
+    p.write_text(">x\nATGC")  # no trailing newline
+    fasta = FASTA()
+    examples = list(fasta._generate_examples([[str(p)]]))
+    rows = [example for _, example in examples]
+    assert rows == [{"id": "x", "description": "x", "sequence": "ATGC"}]
+
+
+def test_fasta_single_record(tmp_path):
+    p = tmp_path / "single.fasta"
+    p.write_text(">only\nA\n")
+    fasta = FASTA()
+    examples = list(fasta._generate_examples([[str(p)]]))
+    assert len(examples) == 1
+
+
+# ┌───────────────────────────────────┐
+# │ FASTQ: Basic functionality        │
+# └───────────────────────────────────┘
+
+
+def test_fastq_basic_functionality(fastq_basic):
+    config = FASTAConfig(file_type="fastq")
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+    generator = fasta._generate_examples([[fastq_basic]])
+    examples = list(generator)
+    assert len(examples) == 2
+
+    # Collect all rows
+    all_rows = [example for _, example in examples]
+
+    assert set(all_rows[0].keys()) == {"id", "description", "sequence", "quality"}
+
+    # Verify first record
+    assert all_rows[0]["id"] == "SRR001666.1"
+    assert all_rows[0]["description"] == "SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36"
+    assert all_rows[0]["sequence"] == "GGGTGATGGCCGCTGCCGATGGCGTCAAATCCCACC"
+    assert all_rows[0]["quality"] == "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9IC"
+
+    # Verify second record
+    assert all_rows[1]["id"] == "SRR001666.2"
+    assert all_rows[1]["description"] == "SRR001666.2 071112_SLXA-EAS1_s_7:5:1:801:338 length=36"
+    assert all_rows[1]["sequence"] == "GTTCAGGGATACGACGTTTGTATTTTAAGAATCTGA"
+    assert all_rows[1]["quality"] == "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII6IBI"
+
+
+def test_fastq_multiline_sequences(fastq_multiline):
+    config = FASTAConfig(file_type="fastq")
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+    generator = fasta._generate_examples([[fastq_multiline]])
+    examples = list(generator)
+
+    # Collect all rows
+    rows = [example for _, example in examples]
+
+    # First record - multi-line sequence and quality should be concatenated
+    assert rows[0]["id"] == "read1"
+    assert rows[0]["sequence"] == "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT"
+    assert rows[0]["quality"] == "!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65"
+
+    # Second record
+    assert rows[1]["id"] == "read2"
+    assert rows[1]["sequence"] == "ACGTACGT"
+    assert rows[1]["quality"] == "IIIIIIII"
+
+
+def test_fastq_default_features():
+    config = FASTAConfig(file_type="fastq")
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+    # FASTQ should have id, description, sequence, and quality
+    assert set(fasta.info.features.keys()) == {"id", "description", "sequence", "quality"}
+
+
+def test_fastq_column_filtering(fastq_basic):
+    config = FASTAConfig(
+        file_type="fastq",
+        columns=["id", "sequence", "quality"],
+    )
+    fasta = FASTA()
+    fasta.config = config
+    # _info() builds the default features; narrow them to the requested columns
+    info = fasta._info()
+    fasta.info.features = Features({col: feat for col, feat in info.features.items() if col in config.columns})
+
+    generator = fasta._generate_examples([[fastq_basic]])
+    examples = list(generator)
+
+    # Ensure only selected columns appear
+    for _, example in examples:
+        assert set(example.keys()) == {"id", "sequence", "quality"}
+        assert isinstance(example["id"], str)
+        assert isinstance(example["sequence"], str)
+        assert isinstance(example["quality"], str)
+
+
+def test_fastq_batch_processing(fastq_basic):
+    config = FASTAConfig(file_type="fastq", batch_size=2)
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+
+    generator = fasta._generate_examples([[fastq_basic]])
+    examples = list(generator)
+
+    # 2 records in the file (batch_size doesn't affect _generate_examples)
+    assert len(examples) == 2
+
+
+def test_fastq_empty_file(fastq_empty):
+    config = FASTAConfig(file_type="fastq")
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+    examples = list(fasta._generate_examples([[fastq_empty]]))
+    assert len(examples) == 0
+
+
+def test_fastq_multiple_files(fastq_multi):
+    f1, f2 = fastq_multi
+    config = FASTAConfig(file_type="fastq")
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+    examples = list(fasta._generate_examples([[f1, f2]]))
+
+    ids = [example["id"] for _, example in examples]
+
+    assert len(examples) == 4
+    assert ids == ["read1", "read2", "read3", "read4"]
+
+
+def test_fastq_gz_via_dl_manager(fastq_gz):
+    # Gzipped FASTQ files resolved through the StreamingDownloadManager must be readable
+    data_files = DataFilesDict({"train": [fastq_gz]})
+    config = FASTAConfig(data_files=data_files, file_type="fastq")
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+
+    dlm = StreamingDownloadManager()
+    splits = fasta._split_generators(dlm)
+    assert len(splits) == 1
+
+    examples = list(fasta._generate_examples(splits[0].gen_kwargs["files"]))
+
+    # Collect all examples
+    rows = [example for _, example in examples]
+
+    assert len(rows) == 2
+    assert rows[0]["id"] == "gz_read1"
+    assert rows[0]["sequence"] == "ATGCATGC"
+    assert rows[0]["quality"] == "IIIIIIII"
+    assert rows[1]["id"] == "gz_read2"
+    assert rows[1]["sequence"] == "GGGGTTTT"
+    assert rows[1]["quality"] == "HHHHHHHH"
+
+
+def test_fastq_quality_scores_preserved(fastq_basic):
+    # Verify that quality scores with special characters are preserved correctly
+    config = FASTAConfig(file_type="fastq")
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+    generator = fasta._generate_examples([[fastq_basic]])
+    examples = list(generator)
+
+    rows = [example for _, example in examples]
+
+    # Check that quality characters are preserved (high quality 'I' and moderate quality digits)
+    assert "I" in rows[0]["quality"]
+    assert "9" in rows[0]["quality"]
+    assert "G" in rows[0]["quality"]
+    assert "C" in rows[0]["quality"]
+    assert "6" in rows[1]["quality"]
+    assert "B" in rows[1]["quality"]
+
+
+def test_fastq_handles_no_trailing_newline(tmp_path):
+    p = tmp_path / "no_newline.fastq"
+    p.write_text("@read1\nATGC\n+\nIIII")  # no trailing newline
+    config = FASTAConfig(file_type="fastq")
+    fasta = FASTA()
+    fasta.config = config
+    fasta.info = fasta._info()
+    examples = list(fasta._generate_examples([[str(p)]]))
+    rows = [example for _, example in examples]
+    assert len(rows) == 1
+    assert rows[0]["id"] == "read1"
+    assert rows[0]["sequence"] == "ATGC"
+    assert rows[0]["quality"] == "IIII"
\ No newline at end of file