diff --git a/q2_types/_util.py b/q2_types/_util.py index 4df29ebf..f7e54f80 100644 --- a/q2_types/_util.py +++ b/q2_types/_util.py @@ -7,16 +7,20 @@ # ---------------------------------------------------------------------------- import gzip import itertools +import os import re import warnings from collections import defaultdict -from typing import List +from typing import List, TypeVar import skbio import pandas as pd import qiime2.plugin.model as model from qiime2.plugin import ValidationError +from qiime2.util import duplicate + +DirFmt = TypeVar("DirFmt", bound=model.DirectoryFormat) def read_from_fasta(path, constructor=skbio.DNA, lowercase=False): @@ -233,3 +237,47 @@ class does not have a suffixes attribute, then the ID is defined to else path.absolute() ) return str(processed_path), _id + + +def _duplicate_with_warning(src, dst): + try: + duplicate(src, dst) + except FileExistsError: + warnings.warn( + f"Skipping {src}. File already " + f"exists in the destination directory." + ) + + +def _collate_helper(dir_fmts: List[DirFmt]) -> DirFmt: + """ + Iterates through a list of directory formats, merging their contents + into a single directory. Can be used with per sample directories and + without. Handles duplicate files by issuing warnings when conflicts occur. + + Parameters: + dir_fmts (iterable): + A List of directory format objects to be collated. + + Returns: + object: + The updated `collated` directory format object containing all + merged files and subdirectories. + """ + # Initialize the collated directory format with the same class as inputs + collated = dir_fmts[0].__class__() + + for dir_fmt in dir_fmts: + for item in dir_fmt.path.iterdir(): + target = collated.path / item.name + # Per sample directories + if item.is_dir(): + target.mkdir(exist_ok=True) + for file in item.iterdir(): + _duplicate_with_warning(file, target / file.name) + # Non per sample directories + else: + _duplicate_with_warning( + item, collated.path / os.path.basename(item) + ) + return collated diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py index e9a54c9e..0cfd53b2 100644 --- a/q2_types/genome_data/__init__.py +++ b/q2_types/genome_data/__init__.py @@ -17,7 +17,8 @@ GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG ) from ._methods import collate_orthologs, partition_orthologs, \ - collate_ortholog_annotations, collate_genomes + collate_ortholog_annotations, collate_genomes, collate_loci, \ + collate_genes, collate_proteins __all__ = [ 'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format', @@ -25,6 +26,6 @@ 'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs', 'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence', 'OrthologAnnotationDirFmt', 'NOG', - 'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations", - 'collate_genomes' + 'collate_orthologs', 'partition_orthologs', 'collate_ortholog_annotations', + 'collate_genomes', 'collate_loci', 'collate_genes', "collate_proteins" ] diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 23dff04b..82be1c01 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -16,35 +16,35 @@ import skbio from qiime2.util import duplicate +from q2_types._util import _collate_helper from q2_types.feature_data import DNAIterator, DNAFASTAFormat -from q2_types.genome_data import (SeedOrthologDirFmt, OrthologAnnotationDirFmt, - LociDirectoryFormat, - GenomeSequencesDirectoryFormat) +from q2_types.genome_data import ( + SeedOrthologDirFmt, OrthologAnnotationDirFmt, LociDirectoryFormat, + GenomeSequencesDirectoryFormat, GenesDirectoryFormat, + ProteinsDirectoryFormat +) def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: - """ - Collate the individual loci directories from the partitions. - Parameters: - - loci: A list of LociDirectoryFormat containing the gff files. - Returns: - - collated_loci: A LociDirectoryFormat object containing the - collated gff files. - """ - collated_loci = LociDirectoryFormat() - for loci_dir in loci: - for fp in loci_dir.path.iterdir(): - try: - duplicate( - fp, - collated_loci.path / os.path.basename(fp) - ) - except FileExistsError: - warnings.warn( - f"Skipping {fp}. File already exists " - f"in the destination directory." - ) - return collated_loci + return _collate_helper(dir_fmts=loci) + + +def collate_ortholog_annotations( + ortholog_annotations: OrthologAnnotationDirFmt +) -> OrthologAnnotationDirFmt: + return _collate_helper(dir_fmts=ortholog_annotations) + + +def collate_genes( + genes: GenesDirectoryFormat +) -> GenesDirectoryFormat: + return _collate_helper(dir_fmts=genes) + + +def collate_proteins( + proteins: ProteinsDirectoryFormat +) -> ProteinsDirectoryFormat: + return _collate_helper(dir_fmts=proteins) def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt: @@ -106,20 +106,6 @@ def partition_orthologs( return partitioned_orthologs -def collate_ortholog_annotations( - ortholog_annotations: OrthologAnnotationDirFmt -) -> OrthologAnnotationDirFmt: - # Init output - collated_annotations = OrthologAnnotationDirFmt() - - # Copy annotations into output - for anno in ortholog_annotations: - for fp in anno.path.iterdir(): - duplicate(fp, collated_annotations.path / fp.name) - - return collated_annotations - - def collate_genomes( genomes: Union[DNAFASTAFormat, GenomeSequencesDirectoryFormat], on_duplicates: str = "warn", diff --git a/q2_types/genome_data/tests/data/partitioned_genes/1/sample1/genes1.fa b/q2_types/genome_data/tests/data/partitioned_genes/1/sample1/genes1.fa new file mode 100644 index 00000000..e00ad6c1 --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_genes/1/sample1/genes1.fa @@ -0,0 +1,9 @@ +>gene1 some_description1 +GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene2 some_description2 +CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA +CGGGGGGCCTTGG +>gene3 some_description3 +GCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACCCGCCCGCTATAGTGAACACCTAAGAA +CTGGAGG diff --git a/q2_types/genome_data/tests/data/partitioned_genes/2/sample1/genes2.fa b/q2_types/genome_data/tests/data/partitioned_genes/2/sample1/genes2.fa new file mode 100644 index 00000000..dd12c09a --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_genes/2/sample1/genes2.fa @@ -0,0 +1,6 @@ +>gene11 some_description11 +ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAGACCCGCCCGCACCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene12 some_description12 +AATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAGA +CGGGGGGCCTTGG diff --git a/q2_types/genome_data/tests/data/partitioned_proteins/1/proteins1.faa b/q2_types/genome_data/tests/data/partitioned_proteins/1/proteins1.faa new file mode 100644 index 00000000..2271b842 --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_proteins/1/proteins1.faa @@ -0,0 +1,6 @@ +>k129_5480_1 # 3 # 1988 # -1 # ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442 +MPKRTDISSICIIGAGPIVIGQACEFDYSGAQACKALKEEGYRVVLINSNPATIMTDPNM +ADATYIEPITP +>k129_5480_2 # 2150 # 2623 # 1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426 +MQKIPLTKQGHTDLEAELKDLKHRQRPAVIAAISEAREHGDLSENAEYHAAREQQSFIEG +RIEQVEAILSLAEIIDPAK diff --git a/q2_types/genome_data/tests/data/partitioned_proteins/2/proteins2.faa b/q2_types/genome_data/tests/data/partitioned_proteins/2/proteins2.faa new file mode 100644 index 00000000..4423cfa8 --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_proteins/2/proteins2.faa @@ -0,0 +1,6 @@ +>k129_5112_1 # 1 # 1218 # -1 # ID=1_1;partial=10;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.669 +MTEQTETSQRPVLVVDFGAQYAQLIARRVREAGVYSELVPHTATAEEIAAKDPIGIILSG +GPSSVYEPGAPTLDPAVFDLGVP +>k129_5112_2 # 1261 # 1797 # -1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.650 +MPREPKPSSFPAIRGALTFYQVASIITGVMLLLLLAEMILKYSPLHVELFAGGSGGFLWF +APVLVG diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index bfb7c99c..ed8e0a57 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -15,14 +15,19 @@ from q2_types.feature_data import DNAFASTAFormat -from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ - partition_orthologs, OrthologAnnotationDirFmt, \ - collate_ortholog_annotations, GenomeSequencesDirectoryFormat -from q2_types.genome_data import LociDirectoryFormat -from q2_types.genome_data._methods import collate_loci, collate_genomes - - -class TestOrthologsPartitionCollating(TestPluginBase): +from q2_types.genome_data import ( + SeedOrthologDirFmt, OrthologAnnotationDirFmt, + GenomeSequencesDirectoryFormat, GenesDirectoryFormat, + ProteinsDirectoryFormat, LociDirectoryFormat +) +from q2_types.genome_data._methods import ( + collate_loci, collate_genomes, collate_orthologs, partition_orthologs, + collate_genes, collate_proteins, collate_ortholog_annotations +) +from q2_types._util import _collate_helper + + +class TestPartitionCollating(TestPluginBase): package = "q2_types.genome_data.tests" def test_collate_orthologs(self): @@ -33,13 +38,78 @@ def test_collate_orthologs(self): SeedOrthologDirFmt(p2, mode="r") ] - collated_orthologs = collate_orthologs(orthologs) + collated = collate_orthologs(orthologs) + self.assertTrue(os.path.exists( + collated.path / "1.emapper.seed_orthologs") + ) + self.assertTrue(os.path.exists( + collated.path / "2.emapper.seed_orthologs") + ) + self.assertIsInstance(collated, SeedOrthologDirFmt) + + def test_collate_helper(self): + p1 = self.get_data_path("partitioned_genes/1/sample1") + p2 = self.get_data_path("partitioned_genes/2/sample1") + dir_fmts = [ + GenesDirectoryFormat(p1, mode="r"), + GenesDirectoryFormat(p2, mode="r") + ] + collated = _collate_helper(dir_fmts) + self.assertTrue(os.path.exists( + collated.path / "genes1.fa") + ) + self.assertTrue(os.path.exists( + collated.path / "genes2.fa") + ) + self.assertIsInstance(collated, GenesDirectoryFormat) + + def test_collate_helper_sample_data(self): + p1 = self.get_data_path("partitioned_genes/1") + p2 = self.get_data_path("partitioned_genes/2") + dir_fmts = [ + GenesDirectoryFormat(p1, mode="r"), + GenesDirectoryFormat(p2, mode="r") + ] + collated = _collate_helper(dir_fmts) + self.assertTrue(os.path.exists( + collated.path / "sample1" / "genes1.fa") + ) + self.assertTrue(os.path.exists( + collated.path / "sample1" / "genes2.fa") + ) + self.assertIsInstance(collated, GenesDirectoryFormat) + + def test_collate_genes(self): + p1 = self.get_data_path("partitioned_genes/1/sample1") + p2 = self.get_data_path("partitioned_genes/2/sample1") + genes = [ + GenesDirectoryFormat(p1, mode="r"), + GenesDirectoryFormat(p2, mode="r") + ] + collated = collate_genes(genes) + self.assertTrue(os.path.exists( + collated.path / "genes1.fa") + ) self.assertTrue(os.path.exists( - collated_orthologs.path / "1.emapper.seed_orthologs") + collated.path / "genes2.fa") + ) + self.assertIsInstance(collated, GenesDirectoryFormat) + + def test_collate_proteins(self): + p1 = self.get_data_path("partitioned_proteins/1") + p2 = self.get_data_path("partitioned_proteins/2") + proteins = [ + ProteinsDirectoryFormat(p1, mode="r"), + ProteinsDirectoryFormat(p2, mode="r") + ] + collated = collate_proteins(proteins) + self.assertTrue(os.path.exists( + collated.path / "proteins1.faa") ) self.assertTrue(os.path.exists( - collated_orthologs.path / "2.emapper.seed_orthologs") + collated.path / "proteins2.faa") ) + self.assertIsInstance(collated, ProteinsDirectoryFormat) def test_collate_loci(self): p1 = self.get_data_path("uncollated_loci_1") @@ -53,20 +123,6 @@ def test_collate_loci(self): self.assertTrue(all(os.path.exists( collated_loci.path / f"loci{no}.gff") for no in [1, 2, 3, 4])) - def test_collate_loci_file_exists(self): - p1 = self.get_data_path("uncollated_loci_1") - loci_list = [ - LociDirectoryFormat(p1, mode="r"), - LociDirectoryFormat(p1, mode="r") - ] - - with warnings.catch_warnings(record=True) as w: - collated_loci = collate_loci(loci_list) - self.assertIn("File already exists", str(w[-1].message)) - - self.assertTrue(all(os.path.exists( - collated_loci.path / f"loci{no}.gff") for no in [1, 2])) - def test_partition_orthologs(self): p = self.get_data_path("collated_orthologs") orthologs = SeedOrthologDirFmt(path=p, mode="r") @@ -159,7 +215,7 @@ def helper_test_collate_genomes_dnafastaformat(self, input): expected_desc = content[expected_id]["description"] expected_sequence = content[expected_id]["sequence"] - self.assertEquals(actual_id, expected_id) + self.assertEqual(actual_id, expected_id) self.assertEqual(actual_description, expected_desc) self.assertEqual(actual_sequence, expected_sequence) @@ -240,7 +296,7 @@ def helper_test_collate_genomes_duplicates_warn(self, dir_fmt): expected_desc = content[expected_id]["description"] exp_sequence = content[expected_id]["sequence"] - self.assertEquals(actual_id, expected_id) + self.assertEqual(actual_id, expected_id) self.assertEqual(actual_description, expected_desc) self.assertEqual(actual_sequence, exp_sequence) diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index 62c26006..f7068f00 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -20,14 +20,17 @@ from q2_types.feature_data_mag import MAG import q2_types.kraken2 -from q2_types.per_sample_sequences import (MAGs, - JoinedSequencesWithQuality, - SequencesWithQuality, - PairedEndSequencesWithQuality, - Contigs) +from q2_types.per_sample_sequences import ( + MAGs, JoinedSequencesWithQuality, SequencesWithQuality, + PairedEndSequencesWithQuality, Contigs +) from q2_types.feature_data import FeatureData, Sequence -from q2_types.genome_data import Orthologs, GenomeData, NOG, Loci, DNASequence -from q2_types.genome_data._methods import collate_loci +from q2_types.genome_data import ( + Orthologs, GenomeData, NOG, Loci, DNASequence, Genes, Proteins +) +from q2_types.genome_data._methods import ( + collate_loci, collate_genes, collate_proteins +) from q2_types.sample_data import SampleData from q2_types.kraken2 import Kraken2Reports, Kraken2Outputs @@ -210,6 +213,30 @@ "and collates them into a single artifact.", ) +plugin.methods.register_function( + function=collate_genes, + inputs={"genes": List[GenomeData[Genes]]}, + parameters={}, + outputs={"collated_genes": GenomeData[Genes]}, + input_descriptions={"genes": "A collection of genes to be collated."}, + name="Collate genes", + description="Takes a collection of GenomeData[Genes]'s " + "and collates them into a single artifact.", +) + +plugin.methods.register_function( + function=collate_proteins, + inputs={"proteins": List[GenomeData[Proteins]]}, + parameters={}, + outputs={"collated_proteins": GenomeData[Proteins]}, + input_descriptions={ + "proteins": "A collection of proteins to be collated." + }, + name="Collate proteins", + description="Takes a collection of GenomeData[Proteins] " + "and collates them into a single artifact.", +) + KRAKEN2_REPORTS = TypeMatch([ SampleData[Kraken2Reports % Properties('reads')], SampleData[Kraken2Reports % Properties('contigs')], diff --git a/q2_types/reference_db/tests/test_formats.py b/q2_types/reference_db/tests/test_formats.py index 323863a9..004a597e 100644 --- a/q2_types/reference_db/tests/test_formats.py +++ b/q2_types/reference_db/tests/test_formats.py @@ -41,7 +41,7 @@ def test_dmnd_dir_fmt_fails_bad_name(self): self.get_data_path('bad_dmnd_db'), mode='r' ) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValidationError, "Missing one or more files for DiamondDatabaseDirFmt"): dmnd_obj.validate() diff --git a/q2_types/tests/test_util.py b/q2_types/tests/test_util.py index 0c495eaf..4635350c 100644 --- a/q2_types/tests/test_util.py +++ b/q2_types/tests/test_util.py @@ -6,14 +6,17 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import os +import warnings from pathlib import Path from q2_types.kraken2 import Kraken2OutputDirectoryFormat from qiime2.plugin import model from qiime2.plugin.testing import TestPluginBase -from q2_types._util import _validate_num_partitions, _validate_mag_ids, \ - FileDictMixin +from q2_types._util import ( + _validate_num_partitions, _validate_mag_ids, FileDictMixin, + _duplicate_with_warning +) class TestUtil(TestPluginBase): @@ -53,6 +56,19 @@ def test_validate_mag_ids_invalid(self): [(0, "a"), (0, "a"), (0, "c"), (0, "d"), (0, "e"), (0, "f")] ) + def test_duplicate_warning(self): + tmpdir = self.temp_dir.name + src = os.path.join(tmpdir, "file.txt") + dst = os.path.join(tmpdir, "file_copy.txt") + with open(src, "w"), open(dst, "w"): + pass + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + _duplicate_with_warning(src, dst) + + self.assertIn("File already exists", str(w[-1].message)) + class TestFileDictMixin(TestPluginBase): package = "q2_types.tests"