From 6e3faa674d682be1ca8a28a44a8c564e9572122c Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 17 Sep 2025 16:35:54 +0200 Subject: [PATCH 01/15] added new collation actions for proteins and genes --- q2_types/genome_data/__init__.py | 5 +- q2_types/genome_data/_methods.py | 61 ++++++++----------- .../tests/data/partitioned_genes/1/genes1.fa | 9 +++ .../tests/data/partitioned_genes/2/genes2.fa | 6 ++ .../1/sample1/proteins1.faa | 6 ++ .../2/sample1/proteins2.faa | 6 ++ q2_types/genome_data/tests/test_methods.py | 54 ++++++++++------ 7 files changed, 91 insertions(+), 56 deletions(-) create mode 100644 q2_types/genome_data/tests/data/partitioned_genes/1/genes1.fa create mode 100644 q2_types/genome_data/tests/data/partitioned_genes/2/genes2.fa create mode 100644 q2_types/genome_data/tests/data/partitioned_proteins/1/sample1/proteins1.faa create mode 100644 q2_types/genome_data/tests/data/partitioned_proteins/2/sample1/proteins2.faa diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py index 1de55bd7..1d2ddec9 100644 --- a/q2_types/genome_data/__init__.py +++ b/q2_types/genome_data/__init__.py @@ -17,7 +17,7 @@ GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG ) from ._methods import collate_orthologs, partition_orthologs, \ - collate_ortholog_annotations + collate_ortholog_annotations, collate_loci, collate_genes __all__ = [ 'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format', @@ -25,5 +25,6 @@ 'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs', 'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence', 'OrthologAnnotationDirFmt', 'NOG', - 'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations" + 'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations", + 'collate_loci', 'collate_genes' ] diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 9fdce2dc..de3d829a 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -13,33 +13,35 @@ from qiime2.util import duplicate from q2_types.genome_data import (SeedOrthologDirFmt, OrthologAnnotationDirFmt, - LociDirectoryFormat) + LociDirectoryFormat, GenesDirectoryFormat, + ProteinsDirectoryFormat) def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: - """ - Collate the individual loci directories from the partitions. - Parameters: - - loci: A list of LociDirectoryFormat containing the gff files. - Returns: - - collated_loci: A LociDirectoryFormat object containing the - collated gff files. - """ - collated_loci = LociDirectoryFormat() - for loci_dir in loci: - for fp in loci_dir.path.iterdir(): - try: - duplicate( - fp, - collated_loci.path / os.path.basename(fp) - ) - except FileExistsError: - warnings.warn( - f"Skipping {fp}. File already exists " - f"in the destination directory." - ) - return collated_loci + return collate_helper(dir_fmts=loci, collated=LociDirectoryFormat()) +def collate_ortholog_annotations( + ortholog_annotations: OrthologAnnotationDirFmt +) -> OrthologAnnotationDirFmt: + return collate_helper(dir_fmts=ortholog_annotations, collated=OrthologAnnotationDirFmt()) + +def collate_genes(genes: GenesDirectoryFormat) -> GenesDirectoryFormat: + return collate_helper(dir_fmts=genes, collated=GenesDirectoryFormat()) + +def collate_proteins(proteins: ProteinsDirectoryFormat) -> ProteinsDirectoryFormat: + return collate_helper(dir_fmts=proteins, collated=ProteinsDirectoryFormat()) + +def collate_helper(dir_fmts, collated): + for dir_fmt in dir_fmts: + for item in dir_fmt.path.iterdir(): + target = collated.path / item.name + if item.is_dir(): + target.mkdir(exist_ok=True) + for file in item.iterdir(): + duplicate(file, target / file.name) + else: + duplicate(item,collated.path / os.path.basename(item)) + return collated def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt: result = SeedOrthologDirFmt() @@ -99,16 +101,3 @@ def partition_orthologs( return partitioned_orthologs - -def collate_ortholog_annotations( - ortholog_annotations: OrthologAnnotationDirFmt -) -> OrthologAnnotationDirFmt: - # Init output - collated_annotations = OrthologAnnotationDirFmt() - - # Copy annotations into output - for anno in ortholog_annotations: - for fp in anno.path.iterdir(): - duplicate(fp, collated_annotations.path / fp.name) - - return collated_annotations diff --git a/q2_types/genome_data/tests/data/partitioned_genes/1/genes1.fa b/q2_types/genome_data/tests/data/partitioned_genes/1/genes1.fa new file mode 100644 index 00000000..e00ad6c1 --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_genes/1/genes1.fa @@ -0,0 +1,9 @@ +>gene1 some_description1 +GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene2 some_description2 +CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA +CGGGGGGCCTTGG +>gene3 some_description3 +GCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACCCGCCCGCTATAGTGAACACCTAAGAA +CTGGAGG diff --git a/q2_types/genome_data/tests/data/partitioned_genes/2/genes2.fa b/q2_types/genome_data/tests/data/partitioned_genes/2/genes2.fa new file mode 100644 index 00000000..dd12c09a --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_genes/2/genes2.fa @@ -0,0 +1,6 @@ +>gene11 some_description11 +ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAGACCCGCCCGCACCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene12 some_description12 +AATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAGA +CGGGGGGCCTTGG diff --git a/q2_types/genome_data/tests/data/partitioned_proteins/1/sample1/proteins1.faa b/q2_types/genome_data/tests/data/partitioned_proteins/1/sample1/proteins1.faa new file mode 100644 index 00000000..2271b842 --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_proteins/1/sample1/proteins1.faa @@ -0,0 +1,6 @@ +>k129_5480_1 # 3 # 1988 # -1 # ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442 +MPKRTDISSICIIGAGPIVIGQACEFDYSGAQACKALKEEGYRVVLINSNPATIMTDPNM +ADATYIEPITP +>k129_5480_2 # 2150 # 2623 # 1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426 +MQKIPLTKQGHTDLEAELKDLKHRQRPAVIAAISEAREHGDLSENAEYHAAREQQSFIEG +RIEQVEAILSLAEIIDPAK diff --git a/q2_types/genome_data/tests/data/partitioned_proteins/2/sample1/proteins2.faa b/q2_types/genome_data/tests/data/partitioned_proteins/2/sample1/proteins2.faa new file mode 100644 index 00000000..4423cfa8 --- /dev/null +++ b/q2_types/genome_data/tests/data/partitioned_proteins/2/sample1/proteins2.faa @@ -0,0 +1,6 @@ +>k129_5112_1 # 1 # 1218 # -1 # ID=1_1;partial=10;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.669 +MTEQTETSQRPVLVVDFGAQYAQLIARRVREAGVYSELVPHTATAEEIAAKDPIGIILSG +GPSSVYEPGAPTLDPAVFDLGVP +>k129_5112_2 # 1261 # 1797 # -1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.650 +MPREPKPSSFPAIRGALTFYQVASIITGVMLLLLLAEMILKYSPLHVELFAGGSGGFLWF +APVLVG diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 754bd5f1..f170241a 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -7,17 +7,17 @@ # ---------------------------------------------------------------------------- import filecmp import os -import warnings from qiime2.plugin.testing import TestPluginBase from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ - partition_orthologs, OrthologAnnotationDirFmt, collate_ortholog_annotations + partition_orthologs, OrthologAnnotationDirFmt, collate_ortholog_annotations, \ + GenesDirectoryFormat, ProteinsDirectoryFormat, collate_loci, collate_genes from q2_types.genome_data import LociDirectoryFormat -from q2_types.genome_data._methods import collate_loci +from q2_types.genome_data._methods import collate_loci, collate_genes -class TestOrthologsPartitionCollating(TestPluginBase): +class TestPartitionCollating(TestPluginBase): package = "q2_types.genome_data.tests" def test_collate_orthologs(self): @@ -35,6 +35,38 @@ def test_collate_orthologs(self): self.assertTrue(os.path.exists( collated_orthologs.path / "2.emapper.seed_orthologs") ) + + def test_collate_genes(self): + p1 = self.get_data_path("partitioned_genes/1") + p2 = self.get_data_path("partitioned_genes/2") + + collated = collate_genes( + [ + GenesDirectoryFormat(p1, mode="r"), + GenesDirectoryFormat(p2, mode="r") + ]) + self.assertTrue(os.path.exists( + collated.path / "genes1.fa") + ) + self.assertTrue(os.path.exists( + collated.path / "genes2.fa") + ) + + def test_collate_proteins_per_sample(self): + p1 = self.get_data_path("partitioned_proteins/1") + p2 = self.get_data_path("partitioned_proteins/2") + + collated = collate_genes( + [ + ProteinsDirectoryFormat(p1, mode="r"), + ProteinsDirectoryFormat(p2, mode="r") + ]) + self.assertTrue(os.path.exists( + collated.path / "sample1" / "proteins1.faa") + ) + self.assertTrue(os.path.exists( + collated.path / "sample1" / "proteins2.faa") + ) def test_collate_loci(self): p1 = self.get_data_path("uncollated_loci_1") @@ -48,20 +80,6 @@ def test_collate_loci(self): self.assertTrue(all(os.path.exists( collated_loci.path / f"loci{no}.gff") for no in [1, 2, 3, 4])) - def test_collate_loci_file_exists(self): - p1 = self.get_data_path("uncollated_loci_1") - loci_list = [ - LociDirectoryFormat(p1, mode="r"), - LociDirectoryFormat(p1, mode="r") - ] - - with warnings.catch_warnings(record=True) as w: - collated_loci = collate_loci(loci_list) - self.assertIn("File already exists", str(w[-1].message)) - - self.assertTrue(all(os.path.exists( - collated_loci.path / f"loci{no}.gff") for no in [1, 2])) - def test_partition_orthologs(self): p = self.get_data_path("collated_orthologs") orthologs = SeedOrthologDirFmt(path=p, mode="r") From 7cf9765bd5abcd0db2915295fb1297b6d06fc6f2 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 17 Sep 2025 16:41:49 +0200 Subject: [PATCH 02/15] added plugin setup --- q2_types/genome_data/_methods.py | 4 +++- q2_types/plugin_setup.py | 26 ++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index de3d829a..2e21e083 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -23,7 +23,9 @@ def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: def collate_ortholog_annotations( ortholog_annotations: OrthologAnnotationDirFmt ) -> OrthologAnnotationDirFmt: - return collate_helper(dir_fmts=ortholog_annotations, collated=OrthologAnnotationDirFmt()) + return collate_helper( + dir_fmts=ortholog_annotations, + collated=OrthologAnnotationDirFmt()) def collate_genes(genes: GenesDirectoryFormat) -> GenesDirectoryFormat: return collate_helper(dir_fmts=genes, collated=GenesDirectoryFormat()) diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index 515b35d1..ba255d6d 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -25,8 +25,8 @@ SequencesWithQuality, PairedEndSequencesWithQuality) from q2_types.feature_data import FeatureData -from q2_types.genome_data import Orthologs, GenomeData, NOG, Loci -from q2_types.genome_data._methods import collate_loci +from q2_types.genome_data import Orthologs, GenomeData, NOG, Loci, Genes, Proteins +from q2_types.genome_data._methods import collate_loci, collate_genes, collate_proteins from q2_types.sample_data import SampleData from q2_types.kraken2 import Kraken2Reports, Kraken2Outputs @@ -209,6 +209,28 @@ "and collates them into a single artifact.", ) +plugin.methods.register_function( + function=collate_genes, + inputs={"genes": List[GenomeData[Genes]]}, + parameters={}, + outputs={"collated_genes": GenomeData[Genes]}, + input_descriptions={"genes": "A collection of genes to be collated."}, + name="Collate genes", + description="Takes a collection of GenomeData[Genes]'s " + "and collates them into a single artifact.", +) + +plugin.methods.register_function( + function=collate_proteins, + inputs={"proteins": List[GenomeData[Proteins]]}, + parameters={}, + outputs={"collated_proteins": GenomeData[Proteins]}, + input_descriptions={"proteins": "A collection of proteins to be collated."}, + name="Collate proteins", + description="Takes a collection of GenomeData[Proteins] " + "and collates them into a single artifact.", +) + KRAKEN2_REPORTS = TypeMatch([ SampleData[Kraken2Reports % Properties('reads')], SampleData[Kraken2Reports % Properties('contigs')], From 9b00a1008574eb82caf99b5a42c07ffe15726737 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 17 Sep 2025 17:09:23 +0200 Subject: [PATCH 03/15] lint --- q2_types/genome_data/__init__.py | 4 ++-- q2_types/genome_data/_methods.py | 22 ++++++++++++++------ q2_types/genome_data/tests/test_methods.py | 24 ++++++++++------------ q2_types/plugin_setup.py | 5 +++-- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/q2_types/genome_data/__init__.py b/q2_types/genome_data/__init__.py index 1d2ddec9..8a8bf69f 100644 --- a/q2_types/genome_data/__init__.py +++ b/q2_types/genome_data/__init__.py @@ -17,7 +17,7 @@ GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG ) from ._methods import collate_orthologs, partition_orthologs, \ - collate_ortholog_annotations, collate_loci, collate_genes + collate_ortholog_annotations, collate_loci, collate_genes, collate_proteins __all__ = [ 'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format', @@ -26,5 +26,5 @@ 'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence', 'OrthologAnnotationDirFmt', 'NOG', 'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations", - 'collate_loci', 'collate_genes' + 'collate_loci', 'collate_genes', "collate_proteins" ] diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 2e21e083..e311e559 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -20,18 +20,28 @@ def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: return collate_helper(dir_fmts=loci, collated=LociDirectoryFormat()) + def collate_ortholog_annotations( ortholog_annotations: OrthologAnnotationDirFmt ) -> OrthologAnnotationDirFmt: return collate_helper( - dir_fmts=ortholog_annotations, + dir_fmts=ortholog_annotations, collated=OrthologAnnotationDirFmt()) -def collate_genes(genes: GenesDirectoryFormat) -> GenesDirectoryFormat: - return collate_helper(dir_fmts=genes, collated=GenesDirectoryFormat()) -def collate_proteins(proteins: ProteinsDirectoryFormat) -> ProteinsDirectoryFormat: - return collate_helper(dir_fmts=proteins, collated=ProteinsDirectoryFormat()) +def collate_genes(genes: GenesDirectoryFormat) -> ( + GenesDirectoryFormat): + return collate_helper( + dir_fmts=genes, + collated=GenesDirectoryFormat()) + + +def collate_proteins(proteins: ProteinsDirectoryFormat) -> ( + ProteinsDirectoryFormat): + return collate_helper( + dir_fmts=proteins, + collated=ProteinsDirectoryFormat()) + def collate_helper(dir_fmts, collated): for dir_fmt in dir_fmts: @@ -45,6 +55,7 @@ def collate_helper(dir_fmts, collated): duplicate(item,collated.path / os.path.basename(item)) return collated + def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt: result = SeedOrthologDirFmt() @@ -102,4 +113,3 @@ def partition_orthologs( partitioned_orthologs[i] = result return partitioned_orthologs - diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index f170241a..b0534711 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -11,10 +11,10 @@ from qiime2.plugin.testing import TestPluginBase from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ - partition_orthologs, OrthologAnnotationDirFmt, collate_ortholog_annotations, \ - GenesDirectoryFormat, ProteinsDirectoryFormat, collate_loci, collate_genes + partition_orthologs, OrthologAnnotationDirFmt, \ + collate_ortholog_annotations, GenesDirectoryFormat, \ + ProteinsDirectoryFormat, collate_loci, collate_genes from q2_types.genome_data import LociDirectoryFormat -from q2_types.genome_data._methods import collate_loci, collate_genes class TestPartitionCollating(TestPluginBase): @@ -35,32 +35,30 @@ def test_collate_orthologs(self): self.assertTrue(os.path.exists( collated_orthologs.path / "2.emapper.seed_orthologs") ) - + def test_collate_genes(self): p1 = self.get_data_path("partitioned_genes/1") p2 = self.get_data_path("partitioned_genes/2") - - collated = collate_genes( - [ + genes = [ GenesDirectoryFormat(p1, mode="r"), GenesDirectoryFormat(p2, mode="r") - ]) + ] + collated = collate_genes(genes) self.assertTrue(os.path.exists( collated.path / "genes1.fa") ) self.assertTrue(os.path.exists( collated.path / "genes2.fa") ) - + def test_collate_proteins_per_sample(self): p1 = self.get_data_path("partitioned_proteins/1") p2 = self.get_data_path("partitioned_proteins/2") - - collated = collate_genes( - [ + proteins = [ ProteinsDirectoryFormat(p1, mode="r"), ProteinsDirectoryFormat(p2, mode="r") - ]) + ] + collated = collate_genes(proteins) self.assertTrue(os.path.exists( collated.path / "sample1" / "proteins1.faa") ) diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index ba255d6d..e36b9aed 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -25,8 +25,9 @@ SequencesWithQuality, PairedEndSequencesWithQuality) from q2_types.feature_data import FeatureData -from q2_types.genome_data import Orthologs, GenomeData, NOG, Loci, Genes, Proteins -from q2_types.genome_data._methods import collate_loci, collate_genes, collate_proteins +from q2_types.genome_data import ( + Orthologs, GenomeData, NOG, Loci, Genes, Proteins) +from q2_types.genome_data import collate_loci, collate_genes, collate_proteins from q2_types.sample_data import SampleData from q2_types.kraken2 import Kraken2Reports, Kraken2Outputs From a7a5f35924243a36b8ea6534f06c9551f72eb024 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 17 Sep 2025 17:17:25 +0200 Subject: [PATCH 04/15] lint --- q2_types/genome_data/_methods.py | 2 +- q2_types/genome_data/tests/test_methods.py | 2 +- q2_types/plugin_setup.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index e311e559..0818d61f 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -52,7 +52,7 @@ def collate_helper(dir_fmts, collated): for file in item.iterdir(): duplicate(file, target / file.name) else: - duplicate(item,collated.path / os.path.basename(item)) + duplicate(item, collated.path / os.path.basename(item)) return collated diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index b0534711..3caeb318 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -11,7 +11,7 @@ from qiime2.plugin.testing import TestPluginBase from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ - partition_orthologs, OrthologAnnotationDirFmt, \ + partition_orthologs, OrthologAnnotationDirFmt, \ collate_ortholog_annotations, GenesDirectoryFormat, \ ProteinsDirectoryFormat, collate_loci, collate_genes from q2_types.genome_data import LociDirectoryFormat diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index e36b9aed..9ceb3c7f 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -226,7 +226,9 @@ inputs={"proteins": List[GenomeData[Proteins]]}, parameters={}, outputs={"collated_proteins": GenomeData[Proteins]}, - input_descriptions={"proteins": "A collection of proteins to be collated."}, + input_descriptions={ + "proteins": "A collection of proteins to be collated." + }, name="Collate proteins", description="Takes a collection of GenomeData[Proteins] " "and collates them into a single artifact.", From e187ad1247844bbbc7cbb8e7bb56e5decdb4a6a6 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 23 Sep 2025 15:56:21 +0200 Subject: [PATCH 05/15] added skipping if file already exists and warning message --- q2_types/genome_data/_methods.py | 22 +++++++++++++++------- q2_types/genome_data/tests/test_methods.py | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 0818d61f..edc8acd8 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -18,41 +18,49 @@ def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: - return collate_helper(dir_fmts=loci, collated=LociDirectoryFormat()) + return _collate_helper(dir_fmts=loci, collated=LociDirectoryFormat()) def collate_ortholog_annotations( ortholog_annotations: OrthologAnnotationDirFmt ) -> OrthologAnnotationDirFmt: - return collate_helper( + return _collate_helper( dir_fmts=ortholog_annotations, collated=OrthologAnnotationDirFmt()) def collate_genes(genes: GenesDirectoryFormat) -> ( GenesDirectoryFormat): - return collate_helper( + return _collate_helper( dir_fmts=genes, collated=GenesDirectoryFormat()) def collate_proteins(proteins: ProteinsDirectoryFormat) -> ( ProteinsDirectoryFormat): - return collate_helper( + return _collate_helper( dir_fmts=proteins, collated=ProteinsDirectoryFormat()) -def collate_helper(dir_fmts, collated): +def _duplicate_warning(src, dst): + try: + duplicate(src, dst) + except FileExistsError: + warnings.warn( + f"Skipping {src}. File already exists in the destination directory." + ) + +def _collate_helper(dir_fmts, collated): for dir_fmt in dir_fmts: for item in dir_fmt.path.iterdir(): target = collated.path / item.name if item.is_dir(): target.mkdir(exist_ok=True) for file in item.iterdir(): - duplicate(file, target / file.name) + _duplicate_warning(file, target / file.name) else: - duplicate(item, collated.path / os.path.basename(item)) + _duplicate_warning(item, collated.path / os.path.basename(item)) return collated diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 3caeb318..4b6eb7d3 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -7,6 +7,7 @@ # ---------------------------------------------------------------------------- import filecmp import os +import warnings from qiime2.plugin.testing import TestPluginBase @@ -15,6 +16,7 @@ collate_ortholog_annotations, GenesDirectoryFormat, \ ProteinsDirectoryFormat, collate_loci, collate_genes from q2_types.genome_data import LociDirectoryFormat +from q2_types.genome_data._methods import _duplicate_warning class TestPartitionCollating(TestPluginBase): @@ -78,6 +80,21 @@ def test_collate_loci(self): self.assertTrue(all(os.path.exists( collated_loci.path / f"loci{no}.gff") for no in [1, 2, 3, 4])) + + def test_duplicate_warning(self): + tmpdir = self.temp_dir.name + src = os.path.join(tmpdir, "file.txt") + dst = os.path.join(tmpdir, "file_copy.txt") + with open(src, "w"), open(dst, "w"): + pass + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + _duplicate_warning(src, dst) + + self.assertIn("File already exists", str(w[-1].message)) + + def test_partition_orthologs(self): p = self.get_data_path("collated_orthologs") orthologs = SeedOrthologDirFmt(path=p, mode="r") From 12fcfe23d055760b930c84b36a2128d0f9791504 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 23 Sep 2025 15:57:03 +0200 Subject: [PATCH 06/15] lint --- q2_types/genome_data/_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index edc8acd8..b9960c32 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -51,6 +51,7 @@ def _duplicate_warning(src, dst): f"Skipping {src}. File already exists in the destination directory." ) + def _collate_helper(dir_fmts, collated): for dir_fmt in dir_fmts: for item in dir_fmt.path.iterdir(): From 604ecace9debbdef5e4652d2848eb6f9dd541fcd Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 23 Sep 2025 16:11:06 +0200 Subject: [PATCH 07/15] lint --- q2_types/genome_data/_methods.py | 7 +++++-- q2_types/genome_data/tests/test_methods.py | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index b9960c32..8f132eec 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -48,7 +48,8 @@ def _duplicate_warning(src, dst): duplicate(src, dst) except FileExistsError: warnings.warn( - f"Skipping {src}. File already exists in the destination directory." + f"Skipping {src}. File already " + f"exists in the destination directory." ) @@ -61,7 +62,9 @@ def _collate_helper(dir_fmts, collated): for file in item.iterdir(): _duplicate_warning(file, target / file.name) else: - _duplicate_warning(item, collated.path / os.path.basename(item)) + _duplicate_warning( + item, collated.path / os.path.basename(item) + ) return collated diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 4b6eb7d3..33fd7372 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -80,7 +80,6 @@ def test_collate_loci(self): self.assertTrue(all(os.path.exists( collated_loci.path / f"loci{no}.gff") for no in [1, 2, 3, 4])) - def test_duplicate_warning(self): tmpdir = self.temp_dir.name src = os.path.join(tmpdir, "file.txt") @@ -94,7 +93,6 @@ def test_duplicate_warning(self): self.assertIn("File already exists", str(w[-1].message)) - def test_partition_orthologs(self): p = self.get_data_path("collated_orthologs") orthologs = SeedOrthologDirFmt(path=p, mode="r") From 3e28b72e85bc127096182608170c65072b88b5a6 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 3 Oct 2025 16:11:59 +0200 Subject: [PATCH 08/15] added docstring and new tests --- q2_types/genome_data/_methods.py | 35 ++++++++---- .../1/{ => sample1}/genes1.fa | 0 .../2/{ => sample1}/genes2.fa | 0 .../1/{sample1 => }/proteins1.faa | 0 .../2/{sample1 => }/proteins2.faa | 0 q2_types/genome_data/tests/test_methods.py | 54 +++++++++++++++---- 6 files changed, 69 insertions(+), 20 deletions(-) rename q2_types/genome_data/tests/data/partitioned_genes/1/{ => sample1}/genes1.fa (100%) rename q2_types/genome_data/tests/data/partitioned_genes/2/{ => sample1}/genes2.fa (100%) rename q2_types/genome_data/tests/data/partitioned_proteins/1/{sample1 => }/proteins1.faa (100%) rename q2_types/genome_data/tests/data/partitioned_proteins/2/{sample1 => }/proteins2.faa (100%) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 8f132eec..15d276e7 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -18,29 +18,23 @@ def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: - return _collate_helper(dir_fmts=loci, collated=LociDirectoryFormat()) + return _collate_helper(dir_fmts=loci) def collate_ortholog_annotations( ortholog_annotations: OrthologAnnotationDirFmt ) -> OrthologAnnotationDirFmt: - return _collate_helper( - dir_fmts=ortholog_annotations, - collated=OrthologAnnotationDirFmt()) + return _collate_helper(dir_fmts=ortholog_annotations) def collate_genes(genes: GenesDirectoryFormat) -> ( GenesDirectoryFormat): - return _collate_helper( - dir_fmts=genes, - collated=GenesDirectoryFormat()) + return _collate_helper(dir_fmts=genes) def collate_proteins(proteins: ProteinsDirectoryFormat) -> ( ProteinsDirectoryFormat): - return _collate_helper( - dir_fmts=proteins, - collated=ProteinsDirectoryFormat()) + return _collate_helper(dir_fmts=proteins) def _duplicate_warning(src, dst): @@ -53,14 +47,33 @@ def _duplicate_warning(src, dst): ) -def _collate_helper(dir_fmts, collated): +def _collate_helper(dir_fmts: list): + """ + Iterates through a list of directory formats, merging their contents + into a single directory. Can be used with per sample directories and + without. Handles duplicate files by issuing warnings when conflicts occur. + + Parameters: + dir_fmts (iterable): + A List of directory format objects to be collated. + + Returns: + object: + The updated `collated` directory format object containing all + merged files and subdirectories. + """ + # Initialize the collated directory format with the same class as inputs + collated = dir_fmts[0].__class__() + for dir_fmt in dir_fmts: for item in dir_fmt.path.iterdir(): target = collated.path / item.name + # Per sample directories if item.is_dir(): target.mkdir(exist_ok=True) for file in item.iterdir(): _duplicate_warning(file, target / file.name) + # Non per sample directories else: _duplicate_warning( item, collated.path / os.path.basename(item) diff --git a/q2_types/genome_data/tests/data/partitioned_genes/1/genes1.fa b/q2_types/genome_data/tests/data/partitioned_genes/1/sample1/genes1.fa similarity index 100% rename from q2_types/genome_data/tests/data/partitioned_genes/1/genes1.fa rename to q2_types/genome_data/tests/data/partitioned_genes/1/sample1/genes1.fa diff --git a/q2_types/genome_data/tests/data/partitioned_genes/2/genes2.fa b/q2_types/genome_data/tests/data/partitioned_genes/2/sample1/genes2.fa similarity index 100% rename from q2_types/genome_data/tests/data/partitioned_genes/2/genes2.fa rename to q2_types/genome_data/tests/data/partitioned_genes/2/sample1/genes2.fa diff --git a/q2_types/genome_data/tests/data/partitioned_proteins/1/sample1/proteins1.faa b/q2_types/genome_data/tests/data/partitioned_proteins/1/proteins1.faa similarity index 100% rename from q2_types/genome_data/tests/data/partitioned_proteins/1/sample1/proteins1.faa rename to q2_types/genome_data/tests/data/partitioned_proteins/1/proteins1.faa diff --git a/q2_types/genome_data/tests/data/partitioned_proteins/2/sample1/proteins2.faa b/q2_types/genome_data/tests/data/partitioned_proteins/2/proteins2.faa similarity index 100% rename from q2_types/genome_data/tests/data/partitioned_proteins/2/sample1/proteins2.faa rename to q2_types/genome_data/tests/data/partitioned_proteins/2/proteins2.faa diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 33fd7372..466034d2 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -16,7 +16,8 @@ collate_ortholog_annotations, GenesDirectoryFormat, \ ProteinsDirectoryFormat, collate_loci, collate_genes from q2_types.genome_data import LociDirectoryFormat -from q2_types.genome_data._methods import _duplicate_warning +from q2_types.genome_data._methods import _duplicate_warning, collate_proteins, \ + _collate_helper class TestPartitionCollating(TestPluginBase): @@ -30,17 +31,50 @@ def test_collate_orthologs(self): SeedOrthologDirFmt(p2, mode="r") ] - collated_orthologs = collate_orthologs(orthologs) + collated = collate_orthologs(orthologs) self.assertTrue(os.path.exists( - collated_orthologs.path / "1.emapper.seed_orthologs") + collated.path / "1.emapper.seed_orthologs") ) self.assertTrue(os.path.exists( - collated_orthologs.path / "2.emapper.seed_orthologs") + collated.path / "2.emapper.seed_orthologs") ) + self.assertIsInstance(collated, SeedOrthologDirFmt) - def test_collate_genes(self): + def test_collate_helper(self): + p1 = self.get_data_path("partitioned_genes/1/sample1") + p2 = self.get_data_path("partitioned_genes/2/sample1") + dir_fmts = [ + GenesDirectoryFormat(p1, mode="r"), + GenesDirectoryFormat(p2, mode="r") + ] + collated = _collate_helper(dir_fmts) + self.assertTrue(os.path.exists( + collated.path / "genes1.fa") + ) + self.assertTrue(os.path.exists( + collated.path / "genes2.fa") + ) + self.assertIsInstance(collated, GenesDirectoryFormat) + + def test_collate_helper_sample_data(self): p1 = self.get_data_path("partitioned_genes/1") p2 = self.get_data_path("partitioned_genes/2") + dir_fmts = [ + GenesDirectoryFormat(p1, mode="r"), + GenesDirectoryFormat(p2, mode="r") + ] + collated = _collate_helper(dir_fmts) + self.assertTrue(os.path.exists( + collated.path / "sample1" / "genes1.fa") + ) + self.assertTrue(os.path.exists( + collated.path / "sample1" / "genes2.fa") + ) + self.assertIsInstance(collated, GenesDirectoryFormat) + + def test_collate_genes(self): + p1 = self.get_data_path("partitioned_genes/1/sample1") + p2 = self.get_data_path("partitioned_genes/2/sample1") genes = [ GenesDirectoryFormat(p1, mode="r"), GenesDirectoryFormat(p2, mode="r") @@ -52,21 +86,23 @@ def test_collate_genes(self): self.assertTrue(os.path.exists( collated.path / "genes2.fa") ) + self.assertIsInstance(collated, GenesDirectoryFormat) - def test_collate_proteins_per_sample(self): + def test_collate_proteins(self): p1 = self.get_data_path("partitioned_proteins/1") p2 = self.get_data_path("partitioned_proteins/2") proteins = [ ProteinsDirectoryFormat(p1, mode="r"), ProteinsDirectoryFormat(p2, mode="r") ] - collated = collate_genes(proteins) + collated = collate_proteins(proteins) self.assertTrue(os.path.exists( - collated.path / "sample1" / "proteins1.faa") + collated.path / "proteins1.faa") ) self.assertTrue(os.path.exists( - collated.path / "sample1" / "proteins2.faa") + collated.path / "proteins2.faa") ) + self.assertIsInstance(collated, ProteinsDirectoryFormat) def test_collate_loci(self): p1 = self.get_data_path("uncollated_loci_1") From 9de650792d9c658c35dbf6831049c1814403470f Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 3 Oct 2025 16:36:56 +0200 Subject: [PATCH 09/15] lint --- q2_types/genome_data/_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 15d276e7..d00d7e11 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -50,11 +50,11 @@ def _duplicate_warning(src, dst): def _collate_helper(dir_fmts: list): """ Iterates through a list of directory formats, merging their contents - into a single directory. Can be used with per sample directories and + into a single directory. Can be used with per sample directories and without. Handles duplicate files by issuing warnings when conflicts occur. Parameters: - dir_fmts (iterable): + dir_fmts (iterable): A List of directory format objects to be collated. Returns: @@ -64,7 +64,7 @@ def _collate_helper(dir_fmts: list): """ # Initialize the collated directory format with the same class as inputs collated = dir_fmts[0].__class__() - + for dir_fmt in dir_fmts: for item in dir_fmt.path.iterdir(): target = collated.path / item.name From 31d0c6b6d153fd2a8b47b0a437832efb28b7503b Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 3 Oct 2025 16:39:02 +0200 Subject: [PATCH 10/15] lint --- q2_types/genome_data/tests/test_methods.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 466034d2..a5f48997 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -14,10 +14,9 @@ from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ partition_orthologs, OrthologAnnotationDirFmt, \ collate_ortholog_annotations, GenesDirectoryFormat, \ - ProteinsDirectoryFormat, collate_loci, collate_genes -from q2_types.genome_data import LociDirectoryFormat -from q2_types.genome_data._methods import _duplicate_warning, collate_proteins, \ - _collate_helper + ProteinsDirectoryFormat, collate_loci, collate_genes, collate_proteins, \ + LociDirectoryFormat +from q2_types.genome_data._methods import _duplicate_warning, _collate_helper class TestPartitionCollating(TestPluginBase): From d0afbfa571ce65015c5985a77cad815d02ba1760 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Mon, 13 Oct 2025 10:55:14 +0200 Subject: [PATCH 11/15] Fix tests and rename duplicate_warn --- q2_types/genome_data/_methods.py | 13 ++++++++----- q2_types/genome_data/tests/test_methods.py | 8 ++++---- q2_types/reference_db/tests/test_formats.py | 2 +- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 39cffc78..7a63b416 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -9,11 +9,12 @@ import os import shutil import warnings -from typing import Union +from typing import Union, TypeVar from warnings import warn import numpy as np import skbio +from qiime2.plugin import model from qiime2.util import duplicate from q2_types.feature_data import DNAIterator, DNAFASTAFormat @@ -22,6 +23,8 @@ GenomeSequencesDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat ) +DirFmt = TypeVar("DirFmt", bound=model.DirectoryFormat) + def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: return _collate_helper(dir_fmts=loci) @@ -43,7 +46,7 @@ def collate_proteins(proteins: ProteinsDirectoryFormat) -> ( return _collate_helper(dir_fmts=proteins) -def _duplicate_warning(src, dst): +def _duplicate_with_warning(src, dst): try: duplicate(src, dst) except FileExistsError: @@ -53,7 +56,7 @@ def _duplicate_warning(src, dst): ) -def _collate_helper(dir_fmts: list): +def _collate_helper(dir_fmts: DirFmt) -> DirFmt: """ Iterates through a list of directory formats, merging their contents into a single directory. Can be used with per sample directories and @@ -78,10 +81,10 @@ def _collate_helper(dir_fmts: list): if item.is_dir(): target.mkdir(exist_ok=True) for file in item.iterdir(): - _duplicate_warning(file, target / file.name) + _duplicate_with_warning(file, target / file.name) # Non per sample directories else: - _duplicate_warning( + _duplicate_with_warning( item, collated.path / os.path.basename(item) ) return collated diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index dff0a337..71432d2d 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -21,7 +21,7 @@ LociDirectoryFormat ) from q2_types.genome_data._methods import ( - collate_loci, collate_genomes, _duplicate_warning, _collate_helper, + collate_loci, collate_genomes, _duplicate_with_warning, _collate_helper, collate_orthologs, partition_orthologs, collate_genes, collate_proteins ) @@ -131,7 +131,7 @@ def test_duplicate_warning(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - _duplicate_warning(src, dst) + _duplicate_with_warning(src, dst) self.assertIn("File already exists", str(w[-1].message)) @@ -227,7 +227,7 @@ def helper_test_collate_genomes_dnafastaformat(self, input): expected_desc = content[expected_id]["description"] expected_sequence = content[expected_id]["sequence"] - self.assertEquals(actual_id, expected_id) + self.assertEqual(actual_id, expected_id) self.assertEqual(actual_description, expected_desc) self.assertEqual(actual_sequence, expected_sequence) @@ -308,7 +308,7 @@ def helper_test_collate_genomes_duplicates_warn(self, dir_fmt): expected_desc = content[expected_id]["description"] exp_sequence = content[expected_id]["sequence"] - self.assertEquals(actual_id, expected_id) + self.assertEqual(actual_id, expected_id) self.assertEqual(actual_description, expected_desc) self.assertEqual(actual_sequence, exp_sequence) diff --git a/q2_types/reference_db/tests/test_formats.py b/q2_types/reference_db/tests/test_formats.py index 323863a9..004a597e 100644 --- a/q2_types/reference_db/tests/test_formats.py +++ b/q2_types/reference_db/tests/test_formats.py @@ -41,7 +41,7 @@ def test_dmnd_dir_fmt_fails_bad_name(self): self.get_data_path('bad_dmnd_db'), mode='r' ) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValidationError, "Missing one or more files for DiamondDatabaseDirFmt"): dmnd_obj.validate() From 585280a922060763c23c097d992106e3e0f19715 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Mon, 13 Oct 2025 10:59:05 +0200 Subject: [PATCH 12/15] Reshuffle the code --- q2_types/genome_data/_methods.py | 65 ++++++++++++++++---------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 7a63b416..2bbcd730 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -9,7 +9,7 @@ import os import shutil import warnings -from typing import Union, TypeVar +from typing import Union, TypeVar, List from warnings import warn import numpy as np @@ -26,37 +26,7 @@ DirFmt = TypeVar("DirFmt", bound=model.DirectoryFormat) -def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: - return _collate_helper(dir_fmts=loci) - - -def collate_ortholog_annotations( - ortholog_annotations: OrthologAnnotationDirFmt -) -> OrthologAnnotationDirFmt: - return _collate_helper(dir_fmts=ortholog_annotations) - - -def collate_genes(genes: GenesDirectoryFormat) -> ( - GenesDirectoryFormat): - return _collate_helper(dir_fmts=genes) - - -def collate_proteins(proteins: ProteinsDirectoryFormat) -> ( - ProteinsDirectoryFormat): - return _collate_helper(dir_fmts=proteins) - - -def _duplicate_with_warning(src, dst): - try: - duplicate(src, dst) - except FileExistsError: - warnings.warn( - f"Skipping {src}. File already " - f"exists in the destination directory." - ) - - -def _collate_helper(dir_fmts: DirFmt) -> DirFmt: +def _collate_helper(dir_fmts: List[DirFmt]) -> DirFmt: """ Iterates through a list of directory formats, merging their contents into a single directory. Can be used with per sample directories and @@ -90,6 +60,27 @@ def _collate_helper(dir_fmts: DirFmt) -> DirFmt: return collated +def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: + return _collate_helper(dir_fmts=loci) + + +def collate_ortholog_annotations( + ortholog_annotations: OrthologAnnotationDirFmt +) -> OrthologAnnotationDirFmt: + return _collate_helper(dir_fmts=ortholog_annotations) + + +def collate_genes(genes: GenesDirectoryFormat) -> ( + GenesDirectoryFormat): + return _collate_helper(dir_fmts=genes) + + +def collate_proteins( + proteins: ProteinsDirectoryFormat +) -> ProteinsDirectoryFormat: + return _collate_helper(dir_fmts=proteins) + + def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt: result = SeedOrthologDirFmt() @@ -195,3 +186,13 @@ def collate_genomes( ) return genomes_dir + + +def _duplicate_with_warning(src, dst): + try: + duplicate(src, dst) + except FileExistsError: + warnings.warn( + f"Skipping {src}. File already " + f"exists in the destination directory." + ) From 5a172d019ec379158811fe141d722d34eeecb686 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Mon, 13 Oct 2025 11:02:22 +0200 Subject: [PATCH 13/15] Move duplicate method to the utils --- q2_types/_util.py | 11 +++++++++++ q2_types/genome_data/_methods.py | 11 +---------- q2_types/genome_data/tests/test_methods.py | 16 ++-------------- q2_types/tests/test_util.py | 20 ++++++++++++++++++-- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/q2_types/_util.py b/q2_types/_util.py index 4df29ebf..150c04dc 100644 --- a/q2_types/_util.py +++ b/q2_types/_util.py @@ -17,6 +17,7 @@ import qiime2.plugin.model as model from qiime2.plugin import ValidationError +from qiime2.util import duplicate def read_from_fasta(path, constructor=skbio.DNA, lowercase=False): @@ -233,3 +234,13 @@ class does not have a suffixes attribute, then the ID is defined to else path.absolute() ) return str(processed_path), _id + + +def _duplicate_with_warning(src, dst): + try: + duplicate(src, dst) + except FileExistsError: + warnings.warn( + f"Skipping {src}. File already " + f"exists in the destination directory." + ) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index 2bbcd730..ae8a1033 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -17,6 +17,7 @@ from qiime2.plugin import model from qiime2.util import duplicate +from q2_types._util import _duplicate_with_warning from q2_types.feature_data import DNAIterator, DNAFASTAFormat from q2_types.genome_data import ( SeedOrthologDirFmt, OrthologAnnotationDirFmt, LociDirectoryFormat, @@ -186,13 +187,3 @@ def collate_genomes( ) return genomes_dir - - -def _duplicate_with_warning(src, dst): - try: - duplicate(src, dst) - except FileExistsError: - warnings.warn( - f"Skipping {src}. File already " - f"exists in the destination directory." - ) diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 71432d2d..8e6c4dfe 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -21,9 +21,10 @@ LociDirectoryFormat ) from q2_types.genome_data._methods import ( - collate_loci, collate_genomes, _duplicate_with_warning, _collate_helper, + collate_loci, collate_genomes, _collate_helper, collate_orthologs, partition_orthologs, collate_genes, collate_proteins ) +from q2_types._util import _duplicate_with_warning class TestPartitionCollating(TestPluginBase): @@ -122,19 +123,6 @@ def test_collate_loci(self): self.assertTrue(all(os.path.exists( collated_loci.path / f"loci{no}.gff") for no in [1, 2, 3, 4])) - def test_duplicate_warning(self): - tmpdir = self.temp_dir.name - src = os.path.join(tmpdir, "file.txt") - dst = os.path.join(tmpdir, "file_copy.txt") - with open(src, "w"), open(dst, "w"): - pass - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - _duplicate_with_warning(src, dst) - - self.assertIn("File already exists", str(w[-1].message)) - def test_partition_orthologs(self): p = self.get_data_path("collated_orthologs") orthologs = SeedOrthologDirFmt(path=p, mode="r") diff --git a/q2_types/tests/test_util.py b/q2_types/tests/test_util.py index 0c495eaf..4635350c 100644 --- a/q2_types/tests/test_util.py +++ b/q2_types/tests/test_util.py @@ -6,14 +6,17 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import os +import warnings from pathlib import Path from q2_types.kraken2 import Kraken2OutputDirectoryFormat from qiime2.plugin import model from qiime2.plugin.testing import TestPluginBase -from q2_types._util import _validate_num_partitions, _validate_mag_ids, \ - FileDictMixin +from q2_types._util import ( + _validate_num_partitions, _validate_mag_ids, FileDictMixin, + _duplicate_with_warning +) class TestUtil(TestPluginBase): @@ -53,6 +56,19 @@ def test_validate_mag_ids_invalid(self): [(0, "a"), (0, "a"), (0, "c"), (0, "d"), (0, "e"), (0, "f")] ) + def test_duplicate_warning(self): + tmpdir = self.temp_dir.name + src = os.path.join(tmpdir, "file.txt") + dst = os.path.join(tmpdir, "file_copy.txt") + with open(src, "w"), open(dst, "w"): + pass + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + _duplicate_with_warning(src, dst) + + self.assertIn("File already exists", str(w[-1].message)) + class TestFileDictMixin(TestPluginBase): package = "q2_types.tests" From d5c8c3d2db15746e297344815f0ce89760f5f4a2 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Mon, 13 Oct 2025 12:51:56 +0200 Subject: [PATCH 14/15] Lint this, lint that --- q2_types/_util.py | 39 ++++++++++++++++++- q2_types/genome_data/_methods.py | 44 ++-------------------- q2_types/genome_data/tests/test_methods.py | 12 +++--- q2_types/plugin_setup.py | 4 +- 4 files changed, 51 insertions(+), 48 deletions(-) diff --git a/q2_types/_util.py b/q2_types/_util.py index 150c04dc..f7e54f80 100644 --- a/q2_types/_util.py +++ b/q2_types/_util.py @@ -7,10 +7,11 @@ # ---------------------------------------------------------------------------- import gzip import itertools +import os import re import warnings from collections import defaultdict -from typing import List +from typing import List, TypeVar import skbio import pandas as pd @@ -19,6 +20,8 @@ from qiime2.plugin import ValidationError from qiime2.util import duplicate +DirFmt = TypeVar("DirFmt", bound=model.DirectoryFormat) + def read_from_fasta(path, constructor=skbio.DNA, lowercase=False): return skbio.read(path, format='fasta', constructor=constructor, @@ -244,3 +247,37 @@ def _duplicate_with_warning(src, dst): f"Skipping {src}. File already " f"exists in the destination directory." ) + + +def _collate_helper(dir_fmts: List[DirFmt]) -> DirFmt: + """ + Iterates through a list of directory formats, merging their contents + into a single directory. Can be used with per sample directories and + without. Handles duplicate files by issuing warnings when conflicts occur. + + Parameters: + dir_fmts (iterable): + A List of directory format objects to be collated. + + Returns: + object: + The updated `collated` directory format object containing all + merged files and subdirectories. + """ + # Initialize the collated directory format with the same class as inputs + collated = dir_fmts[0].__class__() + + for dir_fmt in dir_fmts: + for item in dir_fmt.path.iterdir(): + target = collated.path / item.name + # Per sample directories + if item.is_dir(): + target.mkdir(exist_ok=True) + for file in item.iterdir(): + _duplicate_with_warning(file, target / file.name) + # Non per sample directories + else: + _duplicate_with_warning( + item, collated.path / os.path.basename(item) + ) + return collated diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index ae8a1033..dbddf266 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -9,57 +9,21 @@ import os import shutil import warnings -from typing import Union, TypeVar, List +from typing import Union from warnings import warn import numpy as np import skbio -from qiime2.plugin import model from qiime2.util import duplicate -from q2_types._util import _duplicate_with_warning +from q2_types._util import _collate_helper from q2_types.feature_data import DNAIterator, DNAFASTAFormat from q2_types.genome_data import ( SeedOrthologDirFmt, OrthologAnnotationDirFmt, LociDirectoryFormat, - GenomeSequencesDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat + GenomeSequencesDirectoryFormat, GenesDirectoryFormat, + ProteinsDirectoryFormat ) -DirFmt = TypeVar("DirFmt", bound=model.DirectoryFormat) - - -def _collate_helper(dir_fmts: List[DirFmt]) -> DirFmt: - """ - Iterates through a list of directory formats, merging their contents - into a single directory. Can be used with per sample directories and - without. Handles duplicate files by issuing warnings when conflicts occur. - - Parameters: - dir_fmts (iterable): - A List of directory format objects to be collated. - - Returns: - object: - The updated `collated` directory format object containing all - merged files and subdirectories. - """ - # Initialize the collated directory format with the same class as inputs - collated = dir_fmts[0].__class__() - - for dir_fmt in dir_fmts: - for item in dir_fmt.path.iterdir(): - target = collated.path / item.name - # Per sample directories - if item.is_dir(): - target.mkdir(exist_ok=True) - for file in item.iterdir(): - _duplicate_with_warning(file, target / file.name) - # Non per sample directories - else: - _duplicate_with_warning( - item, collated.path / os.path.basename(item) - ) - return collated - def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat: return _collate_helper(dir_fmts=loci) diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 8e6c4dfe..ed8e0a57 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -16,15 +16,15 @@ from q2_types.feature_data import DNAFASTAFormat from q2_types.genome_data import ( - SeedOrthologDirFmt, OrthologAnnotationDirFmt,collate_ortholog_annotations, - GenomeSequencesDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat, - LociDirectoryFormat + SeedOrthologDirFmt, OrthologAnnotationDirFmt, + GenomeSequencesDirectoryFormat, GenesDirectoryFormat, + ProteinsDirectoryFormat, LociDirectoryFormat ) from q2_types.genome_data._methods import ( - collate_loci, collate_genomes, _collate_helper, - collate_orthologs, partition_orthologs, collate_genes, collate_proteins + collate_loci, collate_genomes, collate_orthologs, partition_orthologs, + collate_genes, collate_proteins, collate_ortholog_annotations ) -from q2_types._util import _duplicate_with_warning +from q2_types._util import _collate_helper class TestPartitionCollating(TestPluginBase): diff --git a/q2_types/plugin_setup.py b/q2_types/plugin_setup.py index 389c0f7c..f7068f00 100644 --- a/q2_types/plugin_setup.py +++ b/q2_types/plugin_setup.py @@ -28,7 +28,9 @@ from q2_types.genome_data import ( Orthologs, GenomeData, NOG, Loci, DNASequence, Genes, Proteins ) -from q2_types.genome_data._methods import collate_loci, collate_genes, collate_proteins +from q2_types.genome_data._methods import ( + collate_loci, collate_genes, collate_proteins +) from q2_types.sample_data import SampleData from q2_types.kraken2 import Kraken2Reports, Kraken2Outputs From 6b50ac8cbc517c93d0139e4c740bdf9bc5d6ecc6 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Mon, 13 Oct 2025 13:21:34 +0200 Subject: [PATCH 15/15] More lint --- q2_types/genome_data/_methods.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/q2_types/genome_data/_methods.py b/q2_types/genome_data/_methods.py index dbddf266..82be1c01 100644 --- a/q2_types/genome_data/_methods.py +++ b/q2_types/genome_data/_methods.py @@ -35,8 +35,9 @@ def collate_ortholog_annotations( return _collate_helper(dir_fmts=ortholog_annotations) -def collate_genes(genes: GenesDirectoryFormat) -> ( - GenesDirectoryFormat): +def collate_genes( + genes: GenesDirectoryFormat +) -> GenesDirectoryFormat: return _collate_helper(dir_fmts=genes)