def collate_genomes(
    genomes: Union[DNAFASTAFormat, GenomeSequencesDirectoryFormat],
    on_duplicates: str = "warn",
) -> GenomeSequencesDirectoryFormat:
    """Collate several genome inputs into one genome sequences directory.

    The type of the first item in ``genomes`` decides how every item is
    read:

    * ``DNAFASTAFormat``: each sequence is written to its own file named
      ``<sequence ID>.fasta``.
    * anything else (expected: ``GenomeSequencesDirectoryFormat``): each
      entry of the item's directory is copied under its own file name.

    An ID (sequence ID or file name, respectively) that shows up more than
    once is a duplicate. With ``on_duplicates="error"`` the first duplicate
    aborts the run; otherwise the later occurrence replaces the earlier one
    and a single warning listing all duplicated IDs is emitted at the end.

    Parameters
    ----------
    genomes
        Sequence of inputs to collate; it is indexed (``genomes[0]``) and
        iterated.
        NOTE(review): the annotation names the element formats rather than
        a list - presumably that is what the plugin framework reads;
        confirm before changing it.
    on_duplicates
        ``"error"`` to raise on duplicates; every other value results in
        the warn-and-overwrite behaviour.

    Returns
    -------
    GenomeSequencesDirectoryFormat
        Newly created directory format holding the collated files.

    Raises
    ------
    ValueError
        If a duplicated ID is found and ``on_duplicates`` is ``"error"``.
    """
    genomes_dir = GenomeSequencesDirectoryFormat()
    error_on_duplicates = on_duplicates == "error"
    ids = set()
    duplicate_ids = set()
    msg = "Duplicate sequence files were found for the following IDs: {}."

    def _register(fn):
        """Remember *fn*; raise or record it if it was already seen."""
        if fn not in ids:
            ids.add(fn)
            return
        if error_on_duplicates:
            raise ValueError(msg.format(fn))
        duplicate_ids.add(fn)

    if isinstance(genomes[0], DNAFASTAFormat):
        for genome_file in genomes:
            for genome in genome_file.view(DNAIterator):
                fn = genome.metadata["id"]
                _register(fn)
                # Mode "w" truncates, so a repeated ID replaces the file
                # written earlier - exactly what the warning announces.
                out_fp = os.path.join(genomes_dir.path, fn + ".fasta")
                with open(out_fp, "w") as f:
                    skbio.io.write(genome, format="fasta", into=f)
    else:
        for genome in genomes:
            for fp in genome.path.iterdir():
                fn = os.path.basename(fp)
                _register(fn)
                # copyfile overwrites an existing destination, so the
                # latest occurrence wins here as well.
                shutil.copyfile(fp, os.path.join(genomes_dir.path, fn))

    if duplicate_ids:
        warn(
            msg.format(", ".join(sorted(duplicate_ids)))
            + " The latest occurrence will overwrite all previous "
            "occurrences for each corresponding ID."
        )

    return genomes_dir
def test_collate_genomes_dnafastaformat_single(self):
    """Collate a single DNAFASTAFormat input."""
    self.helper_test_collate_genomes_dnafastaformat("single")


def test_collate_genomes_dnafastaformat_multiple(self):
    """Collate two DNAFASTAFormat inputs with disjoint sequence IDs."""
    self.helper_test_collate_genomes_dnafastaformat("multiple")


def helper_test_collate_genomes_dnafastaformat(self, input):
    """Check file names and FASTA records produced from FASTA inputs.

    ``input`` is ``"single"`` to collate only the first fixture file; any
    other value collates both fixture files.
    """
    # NOTE: `input` shadows the builtin; the parameter name is kept so the
    # helper's signature stays as it was.
    genomes1 = DNAFASTAFormat(
        self.get_data_path("dna-fasta-format/dna-sequences1.fasta"), "r"
    )
    genomes2 = DNAFASTAFormat(
        self.get_data_path("dna-fasta-format/dna-sequences2.fasta"), "r"
    )

    # Expected records, keyed by sequence ID, in sorted ID order.
    content = {
        "ref1": {"description": "d_Bacteria_1", "sequence": "ACGTACGT"},
        "ref2": {"description": "d_Bacteria_2", "sequence": "CGTCGTCC"},
    }
    if input == "single":
        genomes = [genomes1]
    else:
        genomes = [genomes1, genomes2]
        content["ref5"] = {
            "description": "d_Bacteria_3", "sequence": "ACGTACGT"
        }
        content["ref6"] = {
            "description": "d_Bacteria_4", "sequence": "CGTCGTCC"
        }
    exp_files = [f"{seq_id}.fasta" for seq_id in content]

    collated_genomes = collate_genomes(genomes=genomes)
    actual_files = sorted(os.listdir(collated_genomes.path))
    self.assertEqual(actual_files, exp_files)

    for fn in actual_files:
        expected_id = fn.split(".")[0]
        expected = content[expected_id]
        fp = os.path.join(collated_genomes.path, fn)
        with open(fp, "r") as fasta_file:
            for seq in skbio.io.read(fasta_file, "fasta"):
                self.assertEqual(seq.metadata["id"], expected_id)
                self.assertEqual(
                    seq.metadata["description"], expected["description"]
                )
                self.assertEqual(str(seq), expected["sequence"])


def test_collate_genomes_genome_dir_multiple(self):
    """Collate two genome directories with disjoint file names."""
    genomes1 = GenomeSequencesDirectoryFormat(
        self.get_data_path("genomes-dir-format1"), "r"
    )
    genomes2 = GenomeSequencesDirectoryFormat(
        self.get_data_path("genomes-dir-format2"), "r"
    )
    collated_genomes = collate_genomes(genomes=[genomes1, genomes2])

    exp_files = ["ref1.fasta", "ref2.fasta", "ref3.fasta"]
    actual_files = sorted(os.listdir(collated_genomes.path))
    self.assertEqual(exp_files, actual_files)


def test_collate_genomes_mix(self):
    """A list mixing both formats is rejected with a TypeError.

    The call goes through the registered plugin action
    (``types.methods.collate_genomes``) rather than the plain function.
    """
    genomes1 = DNAFASTAFormat(
        self.get_data_path("dna-fasta-format/dna-sequences1.fasta"), "r"
    )
    genomes2 = GenomeSequencesDirectoryFormat(
        self.get_data_path("genomes-dir-format2"), "r"
    )
    genomes = [genomes2, genomes1]
    with self.assertRaises(TypeError):
        types.methods.collate_genomes(genomes=genomes)
def test_collate_genomes_duplicates_warn_genome(self):
    """Duplicated genome directory files trigger the warning."""
    self.helper_test_collate_genomes_duplicates_warn("GenomeData")


def test_collate_genomes_duplicates_warn_dna(self):
    """Duplicated FASTA sequence IDs trigger the warning."""
    self.helper_test_collate_genomes_duplicates_warn("DNAFASTAFormat")


def helper_test_collate_genomes_duplicates_warn(self, dir_fmt):
    """Collate the same input twice and check the duplicate warning.

    ``dir_fmt`` is ``"GenomeData"`` for the directory fixture; any other
    value uses the FASTA fixture.
    """
    if dir_fmt == "GenomeData":
        duplicate_ids = ["ref1.fasta", "ref2.fasta"]
        genomes1 = GenomeSequencesDirectoryFormat(
            self.get_data_path("genomes-dir-format1"), "r"
        )
    else:
        duplicate_ids = ["ref1", "ref2"]
        genomes1 = DNAFASTAFormat(
            self.get_data_path("dna-fasta-format/dna-sequences1.fasta"),
            "r"
        )
    warn_msg = (
        "Duplicate sequence files were found for the following IDs: {}. "
        "The latest occurrence will overwrite all previous occurrences "
        "for each corresponding ID."
    ).format(", ".join(duplicate_ids))

    with warnings.catch_warnings(record=True) as w:
        # Record every warning regardless of filters set up elsewhere.
        warnings.simplefilter("always")
        collated_genomes = collate_genomes(genomes=[genomes1, genomes1])

    exp_files = ["ref1.fasta", "ref2.fasta"]
    actual_files = sorted(os.listdir(collated_genomes.path))
    self.assertEqual(actual_files, exp_files)
    # Search all recorded warnings: unrelated ones may have been captured
    # as well, so the position of ours is not guaranteed.
    self.assertIn(warn_msg, [str(item.message) for item in w])

    if dir_fmt == "DNAFASTAFormat":
        content = {
            "ref1": {"description": "d_Bacteria_1",
                     "sequence": "ACGTACGT"},
            "ref2": {"description": "d_Bacteria_2",
                     "sequence": "CGTCGTCC"},
        }

        for fn in actual_files:
            expected_id = fn.split(".")[0]
            expected = content[expected_id]
            fp = os.path.join(collated_genomes.path, fn)
            with open(fp, "r") as fasta_file:
                for seq in skbio.io.read(fasta_file, "fasta"):
                    self.assertEqual(seq.metadata["id"], expected_id)
                    self.assertEqual(
                        seq.metadata["description"],
                        expected["description"],
                    )
                    self.assertEqual(str(seq), expected["sequence"])


def test_collate_genomes_duplicates_error_genome(self):
    """Duplicated genome directory files raise in "error" mode."""
    self.helper_test_collate_genomes_duplicates_error("GenomeData")


def test_collate_genomes_duplicates_error_dna(self):
    """Duplicated FASTA sequence IDs raise in "error" mode."""
    self.helper_test_collate_genomes_duplicates_error("DNAFASTAFormat")


def helper_test_collate_genomes_duplicates_error(self, dir_fmt):
    """Collate the same input twice with ``on_duplicates="error"``.

    ``dir_fmt`` is ``"GenomeData"`` for the directory fixture; any other
    value uses the FASTA fixture.
    """
    if dir_fmt == "GenomeData":
        duplicate_ids = ["ref3.fasta"]
        genomes1 = GenomeSequencesDirectoryFormat(
            self.get_data_path("genomes-dir-format2"), "r"
        )
    else:
        duplicate_ids = ["ref1"]
        genomes1 = DNAFASTAFormat(
            self.get_data_path("dna-fasta-format/dna-sequences1.fasta"),
            "r"
        )
    error_msg = (
        "Duplicate sequence files were found for the "
        "following IDs: %s." % ", ".join(duplicate_ids)
    )
    with self.assertRaisesRegex(ValueError, error_msg):
        collate_genomes(
            genomes=[genomes1, genomes1], on_duplicates="error"
        )
def partition_contigs(
    contigs: ContigSequencesDirFmt, num_partitions: int = None
) -> ContigSequencesDirFmt:
    """Split the samples of ``contigs`` into several directory formats.

    The ``(sample ID, file path)`` pairs from ``contigs.sample_dict()`` are
    divided with ``np.array_split`` and each chunk's files are duplicated
    into a fresh ``ContigSequencesDirFmt``.

    ``num_partitions`` is handed to ``_validate_num_partitions`` together
    with the number of samples; whatever that helper returns is the number
    of partitions actually used.

    Returns a dict of the per-partition directory formats. Keys are sample
    IDs when there is exactly one sample per partition, otherwise running
    indices starting at 1.
    NOTE(review): the return annotation names the element format, not the
    dict - presumably for the plugin framework; confirm.
    """
    sample_items = list(contigs.sample_dict().items())
    num_samples = len(sample_items)
    num_partitions = _validate_num_partitions(
        num_samples, num_partitions, "sample"
    )
    one_sample_each = num_partitions == num_samples

    partitioned_contigs = {}
    chunks = np.array_split(sample_items, num_partitions)
    for index, chunk in enumerate(chunks, 1):
        result = ContigSequencesDirFmt()

        for sample_id, sample_fp in chunk:
            duplicate(sample_fp, result.path / os.path.basename(sample_fp))

        # With one sample per partition the inner loop ran exactly once, so
        # `sample_id` still names that partition's only sample and serves
        # as the key. Otherwise a chunk may hold several samples and the
        # running index is used instead.
        key = sample_id if one_sample_each else index
        partitioned_contigs[key] = result

    return partitioned_contigs


def collate_contigs(contigs: ContigSequencesDirFmt) -> ContigSequencesDirFmt:
    """Gather the files of several contig directories into a single one.

    Every entry found in each item's directory is duplicated, under its
    own file name, into one new ``ContigSequencesDirFmt`` that is returned.
    """
    collated_contigs = ContigSequencesDirFmt()

    source_files = (
        fp for contig in contigs for fp in contig.path.iterdir()
    )
    for fp in source_files:
        duplicate(fp, collated_contigs.path / fp.name)

    return collated_contigs
+GCCACTTTAGCAATCGGAAAACCTGTAGCTTTTGAGGCTAAAGCAGAAGAACGAGACACA +CGAGGGTTCATCTCAATCACCACCATACGGCCTGTCTCTGGATCCATTCCAAATTGGACA +TTCGATCCACCTGTTTCAACACCAATCACACGAAGTACGGCCAATGAGGCATTGCGCATG +ATTTGATACTCTTTATCTGTCAGTGTTAAGGCTGGAGCAACGGTAATAGAATCACCTGTA +TGCACGCCCATAGGGTCAATGTTTTCAATCGAACAAATAATGATAGCGTTGTCCTTTGTA +TCACGAACAACCTCCATCTCGTATTCTTTCCAACCCAATAAACTCTCATCAATCAACACT +TCATTGGTTGGTGACGCATCCAAGCCTTCACGAATGATTTGTTCAAACTCATCTTTGTTA +TAAGCAACCCCGCCACCAGAACCACCCATGGTAAAGGATGGACGAATAATCGCTGGTAAG +CCTGTATGTTTCAGAGCCTCTCTAGCCTCTTCCATAGAATGCACCACCGCACTTTTAGGA +CTTTCAAGACCAATCTTCTCCATACAATCTTTAAATAATTGGCGGTCTTCAGCCTTTTCA +ATGGCTTCTTTATTGGCACCGATCAGTTCAATATTGAGTCTTTTTAATACACCCATTTTA +TCAAGAGCCAGTGCAGCATTCAGTGCCGTCTGACCACCCATGGTTGGAAGCAACGCATCG +GGGCGTTCTTTTTCTAAAATCTTTGCGACAATTTCTGGGGTGATTGGCTCAATATAAGTC +GCATCAGCCATATTCGGATCAGTCATAATTGTGGCTGGATTAGAATTAATCAGGACAACG +CGGTACCCCTCTTCTTTCAGCGCTTTACAGGCTTGTGCACCTGAATAGTCAAATTCACAG +GCTTGACCTATCACGATAGGACCAGCGCCAATAATACAAATGGAGGAAATGTCGGTGCGT +TTAGGCATGTGAATCTCGGTTTCTTTTTTTTATACTTACCGAGAGTTAGTTTATGCACTT +ATCAGGGTGTGCAGACAAGCTCTTTCTTGACCTTACCCGCAAGTTTAGCTATATTCTATC +AACAGCCCGCCCTTGATGGCGGGTTATTTTATTGAAAAGGTGCAAGGCTATGCAAAAAAT +ACCCTTAACAAAACAAGGCCACACAGACCTTGAAGCAGAATTAAAAGATTTAAAACACCG +CCAACGTCCAGCGGTTATTGCTGCGATATCTGAAGCCAGAGAACATGGCGATTTATCAGA +AAACGCTGAATATCACGCCGCCCGTGAGCAGCAAAGCTTTATCGAAGGTCGTATCGAGCA +AGTCGAAGCTATTTTATCGCTCGCTGAGATTATTGACCCGGCCAAAATTTCTGGTGACAC +GGTAAAATTTGCAGCAACTGTTAAAGTCGTTGATTGTGACACAGATGATGAACATATCTA +CCAAATCGTCGGTGATGAAGAATCAGACATTGAAACAGGAAAACTGGCTATCTCGTCACC +TGTTGCCCGCGCTTTAATCGGCAAAAAAGTTGAGGACTCAGTCGAAGTCCGCACACCAAA +AGGCACAAGAGAATACGAAATTTTAGAAATTCTGTATAAGTAATTTCTATTCTTCGATCG +GTACGCCAGGCTTCTTGAAATTACGTTTCATAATAAGTGATGACTTAACAGAGCGAACAT +TTTTTAGCGCTGTCAGTTCTTCTGTAATAAAACGCTGATAAGCATCCCAATCTTTGGCCA +CAATACGGAGTGTGAAATCCATATCACCCGCAATCATGTAACAATCACGAACGAGATCCA +TTTTCTCAACGGCTTTGATAAAGGCCTGAAGGTCTTTTTCTGAAGTGTCTTCTAAAGCTA +CATTGGCAAAAACCGCCACACCATAGCCTAACATTGAAGCACTTAAATCCGCATGATAAC 
+TTTGGATATAACCATAATCTTCCAAT diff --git a/q2_types/per_sample_sequences/tests/data/contigs_partitioned/2/sample2_contigs.fa b/q2_types/per_sample_sequences/tests/data/contigs_partitioned/2/sample2_contigs.fa new file mode 100644 index 00000000..524cddb5 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/contigs_partitioned/2/sample2_contigs.fa @@ -0,0 +1,64 @@ +>k129_5112 +CCCCGGAAAGGGCTGGCGACCGACGATGACCTCGGGAAGCCCCAACTCGCGGCCGATGGC +GCGTACCTCGTCTTTGAAGAGGGTGCGAAGGGGCTCGACGAGGTCGAACTGGAGGTCTTC +GGGCAGCCCACCGACGTTGTGGTGGCTCTTGATGTTGGCGGTTCCAGCCCCGCCACCGGA +CTCGACGACATCCGGATACAGGGTGCCCTGCACGAGGAAGCGGATGGGTTCGCCGTCGGC +CTTGGCCTCATCCACGAGCTCGCGCTGCACCCGCTCGAACGCACGGATGAACTCGCGACC +GATGATCTTGCGCTTCTCTTCGGGATCGCTGACGCCGGCGAGGGCCTCGAGGAACGTCTC +GCGGGCGTCGACGGTCACGAGGCGCACACCGGTCGAGGCTACGTAATCCTGCTCGACCTG +TTCGCGTTCGCCCTTGCGCAGCAGGCCGTGGTCGACGAACACGGCAACGAGCTGGTCGCC +GACTGCCTTGTGAACGAGGGCCGTCGAGACAGCCGAGTCGACTCCGCCCGACAGCGCCGA +GAGCACACGACCCGAGCCGACCTGCGCGCGGATCCGATCGACCTGCTCGGCGATGACGTT +GCCGCTGTTCCAGTCTGCGGGGAGGCCCGCAGCCTTGTGCAGGAAGTTCTCGATGATGCG +CTGCCCGTGGTCGGAGTGCTTGACCTCGGGATGCCACTGCACACCGTACATGCGGCGAGC +GTCGTTGCCGAAAGCGGCGACCGGGGTGGCACCGGTGCGGGCGAGCACCTCGAACCCGGC +GGGGGCTTCGGACACCTGGTCACCATGGCTCATCCAGACGTTCTGCTCCGCGGGCTGGCC +ATCGAACAGTACGCTCTCGTCACGGATGATGCTGGCGTCAGTCGCCCCGTACTCGCGCAG +CCCCGTGTTCGCAACGACGCCACCGAGCGCCTGCGCCATGACCTGGAATCCGTAGCAGAT +GCCAAGGGTCGGAACGCCCAGGTCGAACACCGCCGGGTCGAGCGTCGGCGCGCCAGGCTC +GTACACCGATGACGGTCCGCCCGACAGGATGATGCCGATCGGATCTTTTGCGGCAATCTC +TTCAGCTGTCGCGGTGTGCGGAACCAGCTCGCTGTAGACGCCCGCTTCGCGCACGCGACG +GGCAATGAGCTGGGCGTACTGCGCGCCGAAGTCGACGACGAGGACGGGTCGCTGCGAGGT +CTCGGTCTGTTCTGTCACCGGATGCTTTCGGTCGGCGCCCCTGGAACCCAGGAGCGAAGG +TCAGGACACTGTGGGGTTCTGGCGGGTCACGCTGGAGTGTTCGGCGAGATCGTGGTTCTC +GGACTCGCGCGCAGCAAGGTACGTCTTGACCTCACGGGCGACCCGTGCCTCCATGAAGAA +CGACAGGAACGGGACGATTCCGCCCAGCGCGAGGGCGATGAACCGACCGAACCGCCACCG +CATCAGGCTCCAGATGCGGAAGCACGCGAAGAGGTACACGACGTAGAACCAGCCGTGGCC +GACGAGGATCGACAGCGACACATTGACGCCGTCGCCCGCCGACTCGAGGTCGCAGCCCAG 
+ACCCCCGGGCACGAAGAGCGAGTACCACTCGCATCCGGGCCCGACCAGCACCGGTGCGAA +CCAGAGGAAGCCACCGGACCCGCCGGCGAACAGTTCGACGTGCAGCGGCGAGTACTTGAG +GATCATCTCGGCCAGCAGCAGGAGCAGCATGACACCGGTGATGATCGAGGCGACCTGGTA +GAAGGTCAAGGCTCCGCGAATGGCCGGGAAAGACGACGGTTTCGGCTCACGGGGCATGGG +CCCATTCTAGTCGCCGGTTGCGGTCGCGCTTCCCGACGAGGATGCCGCGGCTGCGGCATC +CTCGAGCTCTTCGACTTCCTTCTCCCACGCATCCTTGGCGAGGCGGTACCAGAAATAGAA +GGCGAAGCCGGCGAAGACCACCCACTCGGCGGCGTAGAAGATGTTCAGCCAGTTGACTGT +GGACCCGGCATCCGGCGCGGGCGAGGCGATGTCCACAAGGCCCGCCGGCGCAGACTGCGA +GGCGATGTAGCTGCGATAGACGTCCAGGCCCGCGGTGTCGTGCCACTGCGACAGGAGCGC +CGCCGGCGACATCCGTGTCATCGTGAACGGCGGCTCGCCGCGCGGCGGCGGCACCGGACC +CTCGTCCGAGATCAACCGACCGACGACCGTCACGGACTCCCCCGCGACCGCAGTCTGCTC +GAGCGCCTCGGCGGCGGATTCGGCGACGGTGAGCGTCGGCGCCCAGCCGACGGCGACGGC +CACGGATGTCGGCGTCGCGGTGTCGGCGATACGCAGCTGACCGGTGACCCAGAAGCCTTC +GACGCCGTCGTTGAAGCGCGACGAAACGACAAGGAAATCCTCGGGAACCCACGTGCCCGT +CACCTCGACGCGCTGGCCCACGAGCGGCTCGGGAAGGTACTCGCCGGGCCCGGCGATCTC +GGCGAGCGGCCTGACCTCTTCGGTGGTCCCGGGCGGGAGTGGGTCGGTGTCGATAGCGCG +CGAGAGCTGCCACTGCCCGAGCCACGCGAACACCCCCGCTACGACGAGCGCGAGCAGCAG +GACGCCGATCCAGCGGGGTCGGAGCATGACCTCCCGCAGGGTCGGGGGAAAGACTGTCTG +GTCTGTCATCCGCCCGTATACGGCGCGACGACCACCTCGACGCGCTGGAACTCCTTGAGA +TCGGAGTACCCGGTCGTGGCCATCGACTTCTTCAGCGCCCCGATCAGGTTCGCGGTTCCG +TCAGCCACCGGAGCCGGACCGTAGAGCACGGATTCGAGGTTCGTCACCTGATCCACCTTC +ACGCGGCGACCGCGCGGGAGCTTCGAGTGGTGAGCCTCCGGCCCCCAGTGGTATCCACGA +CCGGGGGCGTCGGTTGCCCGCGAGAGAGCGACGCCGAGCATGACGGCATCCGCTCCCATC +GCGAGCGCCTTGACGATGTCGCCTGACGTTCCCACACCGCCATCGGCGATGACGTGGACG +TAGCGCCCGCCCGACTCGTCGAGGTAGTCGCGGCGCGCGCCGGCGACGTCGGCTACCGCC +GTGGCCATCGGGGCGTGGATGCCGAGAACCCCGCGCGTCGTCGAGGCTGCGCCCCCGCCG +AAGCCGACGAGCACGCCCGCGGCGCCCGTGCGCATGAGGTGCAGGGCTGCCGTGTAGGTC +GCAGCACCGCCGACGATGACAGGCACGTCGAGGTCGTAGATGAACTTCTTGAGGTTGAGG +GGCTCGTCGACGCTCGAGACGTGCTCGGCCGAAACCGTCGTGCCACGGATGACGAACAGG +TCCACACCCGCGGCGACCACGGTTTCGTACAGCTGCTGGGTGCGCTGCGGAGTCAAAGCA +CCGGCCACCGTGACTCCGGCGTCACGGATCTGCTGCAGTCGCTCACGGATGAGCTCGGGC +TTGATCGGCTCGGAGTAGAGCTCCTGCATCCGGCAGGTTGCCGTCGCCTCGTCGAGAGAC 
+GCGATCTCAGCCAGCAGCGGCTCGGGGTCGTCGTACCGGGTCCAGAGCCCCTCGAGATCG +AGGACACCGAGTCCGCCGAGCTGACCGAGCATGATCGCCGTCTGCGGGCTCACAACCGAA +TCCATCGGGGCGCCGAGCACCGGGATGTCGAACTGGAACGCGTCGATCGACCATGCGGTC +GAGACATCCTCGGGATTGCGGGTGCGGCGCGAGGGAACGACGGCGATGTCGTCGAACGAG +TACGCGCGGCGAGCCCGCTTGGCGCGGCCGATCTCGATC diff --git a/q2_types/per_sample_sequences/tests/data/contigs_partitioned/2/sample3_contigs.fa b/q2_types/per_sample_sequences/tests/data/contigs_partitioned/2/sample3_contigs.fa new file mode 100644 index 00000000..489e644b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/contigs_partitioned/2/sample3_contigs.fa @@ -0,0 +1,46 @@ +>k129_6525 +AAACTCTATCAAGCGTATACCAAAGTGAGTGGTGTATTGATCAGTCAGCTCATTATTGAA +TCGGACTTCTGTCTCCAATCGATATATTGATGGAGAAGAAGGGCTCCATAAAAGAGGATT +ATTAAGTTCTAAATTAACCTCTGCTGTCTCTTTATTTTCCGCTTCTACATTAAGCGAATG +ATCCATCTGCAGTAATGGTTTATCGCCTTCATTAGAAAATATTTTAAATACTACAGTAAC +TGCGCTCCAATGATTATACGTATTAAAGACCTCTGCGCGTAACAATAATTCTACTTTATT +TAGGTCTTGCTGATCATGGAAATGGCAGCGCACTTGAGTTCCGCAGTAGGGCACATAGAC +GCAATTAGTAGCTATTAGTCGAACATCGCGGTAGATGCCGCCGCCCTCATACGACCATAG +TTCAAATTCTCTGGCATCGCAGCGAACCGCGACCACGTTCGGCACATTCGCATCGCAAAG +CTCAGTGATATCGAGAGTGAAACTGGTGTAGCCTGATAGATGCCGCCCAGCTAAATGGCC +ATTTACCCATATTGTTGCATCTCGGTAAATGCCATCAAATTCAAGGTGAATACGCTGCTT +GCTAGCTTCTTTGGGAATTTCAAACGTTTTGCGATACCAGCCTACATCAGTCGGCAGTGA +GCCATGCACAGCATTCGCGGATGCCCGAAATTCGCCTTCAATTACGAAATCATGAGGTAG +GTTTATGTCGCGCCATGCTTCATCTGGATAGCCCAAGCGCGCGACCCCATGGTTTCCTGC +CTTTAACCACTCGGCTCGCTTAAAACGATTAGCATGAATGGCTTGATGGTTAGTGCTGTC +TAGCTCACCTCGATGAAACTTCCAACCTTGGTTAAATTCATAAGTGGTACGCATAGAATT +ACTGATGTCTTTTAAAAGATTCTACAAGTGGAGTCTATTAATTATTTGATAAGTTACTCT +GATTATTTTTAGAGATTTCTAATACAACTCCGCTGCACGTGCCGTAACGTCCGCCTTGGT +ATGCGCAAAACAGATGGGTGGGGACGCCTTCAGAGTTAATCAGTAACTGCGGTCGTTCGA +ATCGCCCTTCACGATCCAGTCCAGGCAATGTCTCGTCGAAGTAAGTTCCAGCATCTTTGT +AAGCAACCTGCGGATTTTGCCAAATAAGGCCATCGCTGGATTCCATATACAGCCCATACT +CGTGGTTATAAAAACCCATGTCACGCATGATTATTTTATATGGTGCGCAATCCTTCGGTT +CGTACCATGTGTACGCGTC +>k129_6531 
+TCGGATTTGCCGAATGCTTTTTGTAAGGGCCTTCAATTGATTTGGCGATAGCGAGCCCGT +ATTTACGGTTGCCGTTAAATCGCCGCCCAGTGTCCCAATCCCAACCTTTATAGTAGAGCC +AATATTCGCCATTAGGATTTTGTAATAGCGATGGGTTGCTAACGACGGCATCGTCCCAAT +CGCCATCGCTGCCAACATTAATAACAGGTTCATCTCCAACGCGTCGCCAAGGTCCATTTA +TATTATCGGCAATGGCTAAACCGATGCGTTTGGTGTAGACTAATTGATTGAAGTATTTTT +CGTACTCTGCAGTAGATAAATTGGGTAGCTCATTTTGCTCGATATCTAGTTTCGAGCCAT +CTGCTCCCATGTAGAAAAGAGCATACTTGTCGCCGACCTTTTGCACAGTCGGATTGTGGA +TTGCCCATGAGTCCCAAGCATTTGCACCGCTGCCTTTTAGAACGACTCCTAAATCTTCGT +AGGGGCCTTCCGGAAGATCAGCAACCGCATGGGCCACTTCGCAGGCACTTACCCAACCAG +AAAATGTATACTCGTTTCGCCAACGTGAGTAAAAAACATGAATGCGCCCGTCGGGTCCGT +AGATAGGCGAACAGCACCAAACATGATAGCCTTCTACTTCAAGAATTCGCCCCAGTGGTT +TGAGTTTTTGCTCGAAGTTCGAAGTGCTTACTTCAGAGGTGATGGGACGTAGCTTCTGTA +AATTAATGAGCGACTTATTGCTAACTGTAGAGTCCATGAAAAAAAGGTAAACTTTATACG +AGTAATGTTATGCTCCTTAAAACTGTCAAGGTTTAGGCATTTTGCTGAGCATTATGGTGT +TTAATGGGCTTGAATCATAACAGGATTAAGCGACATTTAAATATTAATGATAAGAATTAG +TGATATAGCTAAAGAGTTAGGGCTTTCGAGGGTTACAGTCTCGGCTATTTTAAATGGACG +ACACCAGAAAATAGGTATTTCCGAAAAGACCGCGCAAAGGGTTCGTTCGAGTGCAAAGGC +TATGGGTTATCTACCCAATCAGAATGCATTGAGTATGAAGAGAGGTCGAAGCATGACTAT +TGGTATGCTGAGTAGTGCGCTATCGGAGGAGTGGGGTGCTAAAATTCTTGTTGGTGCATT +AAGTGCGATAAAGAACACGCCTTATTCACTGCGCGTTGAGTCAGTACAGGGAGCAGCAGA +AGAGCGCGGTGCCCTAGAGCGCCTCTTGGGGTCACGAATTGAAGGGTTGTTGTGCTGCAA +TATAAAT diff --git a/q2_types/per_sample_sequences/tests/test_methods.py b/q2_types/per_sample_sequences/tests/test_methods.py index e4c898a1..e65ee757 100644 --- a/q2_types/per_sample_sequences/tests/test_methods.py +++ b/q2_types/per_sample_sequences/tests/test_methods.py @@ -6,14 +6,14 @@ # The full license is in the file LICENSE, distributed with this software. 
def test_partition_sample_data_contigs_2_partitions(self):
    """Three samples split into two partitions, keyed by index.

    The real ``_validate_num_partitions`` is exercised: ``_methods`` binds
    that function by name at import time, so patching the attribute on
    ``q2_types._util`` would not reach ``partition_contigs`` anyway.
    """
    p = self.get_data_path("contigs")
    contigs = ContigSequencesDirFmt(path=p, mode="r")
    partitioned = partition_contigs(contigs, 2)

    expected = {
        1: ["sample1_contigs.fa", "sample2_contigs.fa"],
        2: ["sample3_contigs.fa"],
    }
    for key, filenames in expected.items():
        for fn in filenames:
            self.assertTrue(os.path.exists(partitioned[key].path / fn))


def test_partition_sample_data_contigs(self):
    """Without ``num_partitions`` the result is keyed by sample ID."""
    p = self.get_data_path("contigs")
    contigs = ContigSequencesDirFmt(path=p, mode="r")
    partitioned = partition_contigs(contigs)

    for sample_id in ["sample1", "sample2", "sample3"]:
        self.assertTrue(
            os.path.exists(
                partitioned[sample_id].path / f"{sample_id}_contigs.fa"
            )
        )


def test_collate_sample_data_contigs(self):
    """Collating both partition fixtures yields all three sample files."""
    p1 = self.get_data_path("contigs_partitioned/1")
    p2 = self.get_data_path("contigs_partitioned/2")
    contigs1 = ContigSequencesDirFmt(path=p1, mode="r")
    contigs2 = ContigSequencesDirFmt(path=p2, mode="r")

    collated = collate_contigs([contigs1, contigs2])

    for fn in [
        "sample1_contigs.fa", "sample2_contigs.fa", "sample3_contigs.fa"
    ]:
        self.assertTrue(os.path.exists(collated.path / fn))
# SampleData[Contigs] -> Collection[SampleData[Contigs]]
plugin.methods.register_function(
    function=q2_types.per_sample_sequences.partition_contigs,
    name="Partition contigs",
    description=(
        "Partition contigs into individual samples or the number of "
        "partitions specified."
    ),
    inputs={"contigs": SampleData[Contigs]},
    input_descriptions={"contigs": "The contigs to partition."},
    parameters={"num_partitions": Int % Range(1, None)},
    parameter_descriptions={
        "num_partitions": (
            "The number of partitions to split the contigs into. "
            "Defaults to partitioning into individual samples."
        )
    },
    outputs={"partitioned_contigs": Collection[SampleData[Contigs]]},
)

# List[SampleData[Contigs]] -> SampleData[Contigs]
plugin.methods.register_function(
    function=q2_types.per_sample_sequences.collate_contigs,
    name="Collate contigs",
    description=(
        "Takes a collection of SampleData[Contigs] and collates them "
        "into a single artifact."
    ),
    inputs={"contigs": List[SampleData[Contigs]]},
    input_descriptions={"contigs": "A collection of contigs to be collated."},
    parameters={},
    outputs={"collated_contigs": SampleData[Contigs]},
)

# Either list type is accepted as input; the output is always
# GenomeData[DNASequence].
plugin.methods.register_function(
    function=q2_types.genome_data.collate_genomes,
    name=(
        "Convert a list of FeatureData[Sequence] or a list of "
        "GenomeData[DNASequence] to GenomeData[DNASequence]."
    ),
    description=(
        "This method converts a list of FeatureData[Sequence] or a "
        "list of GenomeData[DNASequence] to a GenomeData[DNASequence] "
        "artifact."
    ),
    inputs={
        "genomes": List[FeatureData[Sequence]] | List[GenomeData[DNASequence]]
    },
    input_descriptions={"genomes": "A list of genomes to be collated."},
    parameters={"on_duplicates": Str % Choices(["error", "warn"])},
    parameter_descriptions={
        "on_duplicates": (
            "Preferred behaviour when duplicated genome IDs "
            'are encountered: "warn" displays a warning and '
            "continues with the combination of the genomes "
            'while "error" raises an error and aborts further '
            "execution."
        )
    },
    outputs={"collated_genomes": GenomeData[DNASequence]},
    output_descriptions={"collated_genomes": "The converted genomes."},
)