Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions q2_types/genome_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@
GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG
)
from ._methods import collate_orthologs, partition_orthologs, \
collate_ortholog_annotations
collate_ortholog_annotations, collate_genomes

__all__ = [
'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format',
'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
'OrthologAnnotationDirFmt', 'NOG',
'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations"
'collate_orthologs', 'partition_orthologs', "collate_ortholog_annotations",
'collate_genomes'
]
56 changes: 55 additions & 1 deletion q2_types/genome_data/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,19 @@
# ----------------------------------------------------------------------------
import glob
import os
import shutil
import warnings
from typing import Union
from warnings import warn

import numpy as np
import skbio
from qiime2.util import duplicate

from q2_types.feature_data import DNAIterator, DNAFASTAFormat
from q2_types.genome_data import (SeedOrthologDirFmt, OrthologAnnotationDirFmt,
LociDirectoryFormat)
LociDirectoryFormat,
GenomeSequencesDirectoryFormat)


def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat:
Expand Down Expand Up @@ -112,3 +118,51 @@ def collate_ortholog_annotations(
duplicate(fp, collated_annotations.path / fp.name)

return collated_annotations


def collate_genomes(
    genomes: Union[DNAFASTAFormat, GenomeSequencesDirectoryFormat],
    on_duplicates: str = "warn",
) -> GenomeSequencesDirectoryFormat:
    """Collate genome sequences into a single directory of FASTA files.

    Parameters
    ----------
    genomes
        Sequence of inputs that is indexed and iterated here. Only the first
        element is inspected to decide whether the inputs are handled as
        ``DNAFASTAFormat`` files or as directories of genome files. An empty
        sequence produces an empty result.
    on_duplicates
        ``"error"`` raises as soon as an ID is seen a second time. Any other
        value (``"warn"`` by default) emits a single warning once all inputs
        have been processed.

    Returns
    -------
    GenomeSequencesDirectoryFormat
        For FASTA inputs, one ``<id>.fasta`` file per sequence record; for
        directory inputs, a copy of every file found in those directories.

    Raises
    ------
    ValueError
        If ``on_duplicates == "error"`` and a duplicated ID is encountered.
    """
    genomes_dir = GenomeSequencesDirectoryFormat()
    error_on_duplicates = on_duplicates == "error"
    ids = set()
    duplicate_ids = set()
    msg = "Duplicate sequence files were found for the following IDs: {}."

    def _is_first_occurrence(key):
        # True the first time ``key`` is seen. Otherwise the key is recorded
        # as a duplicate (raising immediately in "error" mode) and False is
        # returned so the caller skips writing it again.
        if key not in ids:
            ids.add(key)
            return True
        duplicate_ids.add(key)
        if error_on_duplicates:
            raise ValueError(msg.format(", ".join(sorted(duplicate_ids))))
        return False

    # Guard against an empty input before peeking at the first element.
    if genomes and isinstance(genomes[0], DNAFASTAFormat):
        for genome_file in genomes:
            for genome in genome_file.view(DNAIterator):
                seq_id = genome.metadata["id"]
                if _is_first_occurrence(seq_id):
                    out_fp = os.path.join(genomes_dir.path, seq_id + ".fasta")
                    with open(out_fp, "w") as f:
                        skbio.io.write(genome, format="fasta", into=f)
    else:
        for genome in genomes:
            for fp in genome.path.iterdir():
                fn = os.path.basename(fp)
                if _is_first_occurrence(fn):
                    shutil.copyfile(fp, os.path.join(genomes_dir.path, fn))

    if duplicate_ids:
        # NOTE(review): the text below says the latest occurrence wins, but
        # the loops above keep the FIRST occurrence and skip later ones.
        # The wording is left untouched because callers/tests match on it;
        # confirm which of the two is intended and align them.
        warn(
            msg.format(", ".join(sorted(duplicate_ids)))
            + " The latest occurrence will overwrite all previous "
            "occurrences for each corresponding ID."
        )

    return genomes_dir
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>ref1 d_Bacteria_1
ACGTACGT
>ref2 d_Bacteria_2
CGTCGTCC
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>ref5 d_Bacteria_3
ACGTACGT
>ref6 d_Bacteria_4
CGTCGTCC
Empty file modified q2_types/genome_data/tests/data/genes/genes1.fa
100644 → 100755
Empty file.
Empty file modified q2_types/genome_data/tests/data/genes/genes2.fa
100644 → 100755
Empty file.
Empty file modified q2_types/genome_data/tests/data/genes_samples/sample1/genes1.fa
100644 → 100755
Empty file.
Empty file modified q2_types/genome_data/tests/data/genes_samples/sample2/genes2.fa
100644 → 100755
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>ref1
ACGTTACGT
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>ref2
ACGGGTACT
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>ref3
ACGTTACGT
173 changes: 171 additions & 2 deletions q2_types/genome_data/tests/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,17 @@
import os
import warnings

import skbio
from qiime2.plugin.testing import TestPluginBase
from qiime2.plugins import types


from q2_types.feature_data import DNAFASTAFormat
from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \
partition_orthologs, OrthologAnnotationDirFmt, collate_ortholog_annotations
partition_orthologs, OrthologAnnotationDirFmt, \
collate_ortholog_annotations, GenomeSequencesDirectoryFormat
from q2_types.genome_data import LociDirectoryFormat
from q2_types.genome_data._methods import collate_loci
from q2_types.genome_data._methods import collate_loci, collate_genomes


class TestOrthologsPartitionCollating(TestPluginBase):
Expand Down Expand Up @@ -100,3 +105,167 @@ def test_collate_ortholog_annotations(self):
compare.common,
[f"{letter}.annotations" for letter in ["a", "b", "c"]]
)

def test_collate_genomes_dnafastaformat_single(self):
    """Run the shared DNAFASTAFormat collation check in "single" mode."""
    scenario = "single"
    self.helper_test_collate_genomes_dnafastaformat(scenario)

def test_collate_genomes_dnafastaformat_multiple(self):
    """Run the shared DNAFASTAFormat collation check in "multiple" mode."""
    scenario = "multiple"
    self.helper_test_collate_genomes_dnafastaformat(scenario)

def helper_test_collate_genomes_dnafastaformat(self, input):
    """Collate DNAFASTAFormat inputs and verify the files that are written.

    ``input`` selects the scenario: ``"single"`` collates only the first
    FASTA fixture, anything else collates both fixtures. The helper checks
    the produced file names and then the id, description and sequence of
    every record in every produced file.
    """
    genomes1 = DNAFASTAFormat(
        self.get_data_path("dna-fasta-format/dna-sequences1.fasta"), "r"
    )
    genomes2 = DNAFASTAFormat(
        self.get_data_path("dna-fasta-format/dna-sequences2.fasta"), "r"
    )

    # Expected records, grouped by the fixture file they come from.
    first_file = {
        "ref1": {"description": "d_Bacteria_1", "sequence": "ACGTACGT"},
        "ref2": {"description": "d_Bacteria_2", "sequence": "CGTCGTCC"},
    }
    second_file = {
        "ref5": {"description": "d_Bacteria_3", "sequence": "ACGTACGT"},
        "ref6": {"description": "d_Bacteria_4", "sequence": "CGTCGTCC"},
    }

    if input == "single":
        genomes = [genomes1]
        content = dict(first_file)
    else:
        genomes = [genomes1, genomes2]
        content = {**first_file, **second_file}
    exp_files = [f"{seq_id}.fasta" for seq_id in sorted(content)]

    collated_genomes = collate_genomes(genomes=genomes)
    actual_files = sorted(os.listdir(collated_genomes.path))
    self.assertEqual(actual_files, exp_files)

    for fn in actual_files:
        # Each output file is named after the id of the record it holds.
        expected_id = fn.split(".")[0]
        expected = content[expected_id]
        fp = os.path.join(collated_genomes.path, fn)
        with open(fp, "r") as fasta_file:
            for seq in skbio.io.read(fasta_file, "fasta"):
                # assertEqual replaces the deprecated assertEquals alias,
                # which no longer exists on Python 3.12+.
                self.assertEqual(seq.metadata["id"], expected_id)
                self.assertEqual(
                    seq.metadata["description"], expected["description"]
                )
                self.assertEqual(str(seq), expected["sequence"])

def test_collate_genomes_genome_dir_multiple(self):
    """Collating two genome directories yields the union of their files."""
    genomes = [
        GenomeSequencesDirectoryFormat(self.get_data_path(dir_name), "r")
        for dir_name in ("genomes-dir-format1", "genomes-dir-format2")
    ]

    collated_genomes = collate_genomes(genomes=genomes)

    actual_files = sorted(os.listdir(collated_genomes.path))
    self.assertEqual(
        ["ref1.fasta", "ref2.fasta", "ref3.fasta"], actual_files
    )

def test_collate_genomes_mix(self):
    """Mixing a genome directory and a FASTA file must raise TypeError."""
    fasta_input = DNAFASTAFormat(
        self.get_data_path("dna-fasta-format/dna-sequences1.fasta"), "r"
    )
    dir_input = GenomeSequencesDirectoryFormat(
        self.get_data_path("genomes-dir-format2"), "r"
    )
    mixed_inputs = [dir_input, fasta_input]

    # Called through ``types.methods`` rather than the plain function.
    with self.assertRaises(TypeError):
        types.methods.collate_genomes(genomes=mixed_inputs)

def test_collate_genomes_duplicates_warn_genome(self):
    """Run the duplicate-warning check with the "GenomeData" fixture."""
    fixture_kind = "GenomeData"
    self.helper_test_collate_genomes_duplicates_warn(fixture_kind)

def test_collate_genomes_duplicates_warn_dna(self):
    """Run the duplicate-warning check with the "DNAFASTAFormat" fixture."""
    fixture_kind = "DNAFASTAFormat"
    self.helper_test_collate_genomes_duplicates_warn(fixture_kind)

def helper_test_collate_genomes_duplicates_warn(self, dir_fmt):
    """Collate the same input twice and check the duplicate warning.

    ``dir_fmt`` is ``"GenomeData"`` to use the genome-directory fixture;
    any other value uses the FASTA fixture. The helper verifies the
    warning text and the produced file names, and for ``"DNAFASTAFormat"``
    also the content of every produced record.
    """
    if dir_fmt == "GenomeData":
        duplicate_ids = ["ref1.fasta", "ref2.fasta"]
        genomes1 = GenomeSequencesDirectoryFormat(
            self.get_data_path("genomes-dir-format1"), "r"
        )
    else:
        duplicate_ids = ["ref1", "ref2"]
        genomes1 = DNAFASTAFormat(
            self.get_data_path("dna-fasta-format/dna-sequences1.fasta"),
            "r"
        )
    warn_msg = (
        "Duplicate sequence files were found for the following IDs: {}. "
        "The latest occurrence will overwrite all previous occurrences "
        "for each corresponding ID."
    ).format(", ".join(duplicate_ids))

    with warnings.catch_warnings(record=True) as w:
        # Without this, an active "once"/"ignore" filter could leave the
        # recorded list empty and make the check below fail spuriously.
        warnings.simplefilter("always")
        collated_genomes = collate_genomes(genomes=[genomes1, genomes1])

    # Search all recorded warnings instead of assuming ours is w[0];
    # "always" may also surface unrelated warnings from dependencies.
    recorded = [str(item.message) for item in w]
    self.assertIn(warn_msg, recorded)

    exp_files = ["ref1.fasta", "ref2.fasta"]
    actual_files = sorted(os.listdir(collated_genomes.path))
    self.assertEqual(actual_files, exp_files)

    if dir_fmt != "DNAFASTAFormat":
        return

    content = {
        "ref1": {"description": "d_Bacteria_1", "sequence": "ACGTACGT"},
        "ref2": {"description": "d_Bacteria_2", "sequence": "CGTCGTCC"},
    }
    for fn in actual_files:
        expected_id = fn.split(".")[0]
        expected = content[expected_id]
        fp = os.path.join(collated_genomes.path, fn)
        with open(fp, "r") as fasta_file:
            for seq in skbio.io.read(fasta_file, "fasta"):
                # assertEqual replaces the deprecated assertEquals alias,
                # which no longer exists on Python 3.12+.
                self.assertEqual(seq.metadata["id"], expected_id)
                self.assertEqual(
                    seq.metadata["description"], expected["description"]
                )
                self.assertEqual(str(seq), expected["sequence"])

def test_collate_genomes_duplicates_error_genome(self):
    """Run the duplicate-error check with the "GenomeData" fixture."""
    fixture_kind = "GenomeData"
    self.helper_test_collate_genomes_duplicates_error(fixture_kind)

def test_collate_genomes_duplicates_error_dna(self):
    """Run the duplicate-error check with the "DNAFASTAFormat" fixture."""
    fixture_kind = "DNAFASTAFormat"
    self.helper_test_collate_genomes_duplicates_error(fixture_kind)

def helper_test_collate_genomes_duplicates_error(self, dir_fmt):
    """Collate the same input twice with ``on_duplicates="error"``.

    ``dir_fmt`` is ``"GenomeData"`` to use the genome-directory fixture;
    any other value uses the FASTA fixture. A ``ValueError`` carrying the
    duplicate-ID message is expected in both cases.
    """
    # Local import: ``re`` is not among the imports visible for this module.
    import re

    if dir_fmt == "GenomeData":
        duplicate_ids = ["ref3.fasta"]
        genomes1 = GenomeSequencesDirectoryFormat(
            self.get_data_path("genomes-dir-format2"), "r"
        )
    else:
        duplicate_ids = ["ref1"]
        genomes1 = DNAFASTAFormat(
            self.get_data_path("dna-fasta-format/dna-sequences1.fasta"),
            "r"
        )
    error_msg = (
        "Duplicate sequence files were found for the "
        "following IDs: %s." % ", ".join(duplicate_ids)
    )

    # The message contains "." characters; escape it so the regex matches
    # the literal text rather than any character in those positions.
    with self.assertRaisesRegex(ValueError, re.escape(error_msg)):
        collate_genomes(
            genomes=[genomes1, genomes1], on_duplicates="error"
        )
6 changes: 4 additions & 2 deletions q2_types/per_sample_sequences/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
JoinedSequencesWithQuality, MAGs, Contigs,
SingleBowtie2Index, MultiBowtie2Index,
AlignmentMap, MultiAlignmentMap)
from ._methods import partition_sample_data_mags, collate_sample_data_mags
from ._methods import partition_sample_data_mags, collate_sample_data_mags, \
collate_contigs, partition_contigs
from ._partitioners import partition_samples_paired, partition_samples_single


Expand All @@ -55,5 +56,6 @@
'BAMFormat', 'BAMDirFmt', 'MultiBAMDirFmt',
'MultiFASTADirectoryFormat', 'AlignmentMap', 'MultiAlignmentMap',
'partition_sample_data_mags', 'collate_sample_data_mags',
'partition_samples_single', 'partition_samples_paired'
'partition_samples_single', 'partition_samples_paired',
'collate_contigs', 'partition_contigs'
]
45 changes: 44 additions & 1 deletion q2_types/per_sample_sequences/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from qiime2.util import duplicate

from q2_types._util import _validate_num_partitions
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt, \
ContigSequencesDirFmt


def partition_sample_data_mags(
Expand Down Expand Up @@ -98,3 +99,45 @@ def collate_sample_data_mags(
dest.write(line)

return collated_mags


def partition_contigs(
    contigs: ContigSequencesDirFmt, num_partitions: int = None
) -> ContigSequencesDirFmt:
    """Split the samples of ``contigs`` into ``num_partitions`` groups.

    The (sample id, file path) pairs from ``contigs.sample_dict()`` are
    divided with ``np.array_split`` and each group's files are duplicated
    into a fresh ``ContigSequencesDirFmt``. The result is a dict of those
    directories, keyed by sample id when there is exactly one sample per
    partition and by 1-based partition index otherwise.
    """
    sample_items = list(contigs.sample_dict().items())
    num_samples = len(sample_items)
    num_partitions = _validate_num_partitions(
        num_samples, num_partitions, "sample"
    )
    # With as many partitions as samples every chunk holds one sample, so
    # that sample's id can serve as the key; otherwise fall back to indices.
    one_sample_per_partition = num_partitions == num_samples

    partitioned_contigs = {}
    chunks = np.array_split(sample_items, num_partitions)
    for index, chunk in enumerate(chunks, 1):
        partition = ContigSequencesDirFmt()

        for sample_id, sample_fp in chunk:
            target = partition.path / os.path.basename(sample_fp)
            duplicate(sample_fp, target)

        key = sample_id if one_sample_per_partition else index
        partitioned_contigs[key] = partition

    return partitioned_contigs


def collate_contigs(contigs: ContigSequencesDirFmt) -> ContigSequencesDirFmt:
    """Gather the files of several contig directories into a single one.

    Every entry found directly inside each input's ``path`` is duplicated,
    under its own file name, into a new ``ContigSequencesDirFmt`` which is
    then returned.
    """
    collated_contigs = ContigSequencesDirFmt()
    destination = collated_contigs.path

    # Lazily walk the inputs in order, yielding each contained file path.
    source_files = (
        fp for contig in contigs for fp in contig.path.iterdir()
    )
    for fp in source_files:
        duplicate(fp, destination / fp.name)

    return collated_contigs
Loading
Loading