Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion q2_types/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,20 @@
# ----------------------------------------------------------------------------
import gzip
import itertools
import os
import re
import warnings
from collections import defaultdict
from typing import List
from typing import List, TypeVar

import skbio
import pandas as pd

import qiime2.plugin.model as model
from qiime2.plugin import ValidationError
from qiime2.util import duplicate

DirFmt = TypeVar("DirFmt", bound=model.DirectoryFormat)


def read_from_fasta(path, constructor=skbio.DNA, lowercase=False):
Expand Down Expand Up @@ -233,3 +237,47 @@ class does not have a suffixes attribute, then the ID is defined to
else path.absolute()
)
return str(processed_path), _id


def _duplicate_with_warning(src, dst):
    """
    Duplicate ``src`` to ``dst``, warning instead of failing on a name clash.

    Parameters:
        src:
            Path of the file to duplicate.
        dst:
            Destination path of the duplicate.

    A ``FileExistsError`` raised by ``duplicate`` is caught and reported via
    ``warnings.warn``; every other exception propagates to the caller.
    """
    try:
        duplicate(src, dst)
    except FileExistsError:
        message = (
            f"Skipping {src}. File already "
            f"exists in the destination directory."
        )
        warnings.warn(message)


def _collate_helper(dir_fmts: List[DirFmt]) -> DirFmt:
    """
    Merge the contents of several directory formats into a new one.

    A new instance of the first input's class is created, and every entry
    found directly inside each input directory is duplicated into it.
    Entries that are directories (per sample layout) are recreated in the
    output and their immediate children are duplicated into them; all other
    entries are duplicated straight into the output root. If a destination
    file already exists, a warning is issued and the file is skipped.

    Parameters:
        dir_fmts (list):
            Directory format objects to be collated. Must not be empty,
            since the first element determines the class of the result.

    Returns:
        object:
            The newly created directory format object containing all
            merged files and subdirectories.
    """
    # The output has the same class as the (first) input
    fmt_cls = dir_fmts[0].__class__
    merged = fmt_cls()

    for source in dir_fmts:
        for entry in source.path.iterdir():
            destination = merged.path / entry.name

            # Non per sample layout: the entry is duplicated as is
            if not entry.is_dir():
                _duplicate_with_warning(entry, destination)
                continue

            # Per sample layout: recreate the directory, then duplicate
            # each of its immediate children into it
            destination.mkdir(exist_ok=True)
            for child in entry.iterdir():
                _duplicate_with_warning(child, destination / child.name)
    return merged
7 changes: 4 additions & 3 deletions q2_types/genome_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@
GenomeData, Genes, Proteins, Loci, Orthologs, DNASequence, NOG
)
from ._methods import collate_orthologs, partition_orthologs, \
collate_ortholog_annotations, collate_genomes
collate_ortholog_annotations, collate_genomes, collate_loci, \
collate_genes, collate_proteins

# Public API of the genome_data package. Every name is listed exactly once
# and every entry is followed by a comma so that adjacent string literals
# can never be concatenated implicitly.
__all__ = [
    'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format',
    'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat',
    'IntervalMetadataIterator', 'OrthologFileFmt', 'Orthologs',
    'SeedOrthologDirFmt', 'GenomeSequencesDirectoryFormat', 'DNASequence',
    'OrthologAnnotationDirFmt', 'NOG',
    'collate_orthologs', 'partition_orthologs', 'collate_ortholog_annotations',
    'collate_genomes', 'collate_loci', 'collate_genes', 'collate_proteins',
]
64 changes: 25 additions & 39 deletions q2_types/genome_data/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,35 +16,35 @@
import skbio
from qiime2.util import duplicate

from q2_types._util import _collate_helper
from q2_types.feature_data import DNAIterator, DNAFASTAFormat
from q2_types.genome_data import (SeedOrthologDirFmt, OrthologAnnotationDirFmt,
LociDirectoryFormat,
GenomeSequencesDirectoryFormat)
from q2_types.genome_data import (
SeedOrthologDirFmt, OrthologAnnotationDirFmt, LociDirectoryFormat,
GenomeSequencesDirectoryFormat, GenesDirectoryFormat,
ProteinsDirectoryFormat
)


def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat:
    """
    Collate the individual loci directories from the partitions.

    The merging itself is delegated to ``_collate_helper``: files whose name
    already exists in the collated directory are skipped with a warning.

    Parameters:
        - loci: A list of LociDirectoryFormat containing the gff files.
    Returns:
        - collated_loci: A LociDirectoryFormat object containing the
            collated gff files.
    """
    return _collate_helper(dir_fmts=loci)


def collate_ortholog_annotations(
    ortholog_annotations: OrthologAnnotationDirFmt
) -> OrthologAnnotationDirFmt:
    """
    Collate the ortholog annotation directories from the partitions.

    Parameters:
        - ortholog_annotations: A list of OrthologAnnotationDirFmt objects
            to be merged.
    Returns:
        - The directory format object produced by ``_collate_helper``,
            holding the contents of all inputs.
    """
    collated = _collate_helper(ortholog_annotations)
    return collated


def collate_genes(
    genes: GenesDirectoryFormat
) -> GenesDirectoryFormat:
    """
    Collate the genes directories from the partitions.

    Parameters:
        - genes: A list of GenesDirectoryFormat objects to be merged.
    Returns:
        - The directory format object produced by ``_collate_helper``,
            holding the contents of all inputs.
    """
    collated = _collate_helper(genes)
    return collated


def collate_proteins(
    proteins: ProteinsDirectoryFormat
) -> ProteinsDirectoryFormat:
    """
    Collate the proteins directories from the partitions.

    Parameters:
        - proteins: A list of ProteinsDirectoryFormat objects to be merged.
    Returns:
        - The directory format object produced by ``_collate_helper``,
            holding the contents of all inputs.
    """
    collated = _collate_helper(proteins)
    return collated


def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt:
Expand Down Expand Up @@ -106,20 +106,6 @@ def partition_orthologs(
return partitioned_orthologs


def collate_ortholog_annotations(
    ortholog_annotations: OrthologAnnotationDirFmt
) -> OrthologAnnotationDirFmt:
    """
    Collate the ortholog annotation directories from the partitions.

    Delegates to ``_collate_helper`` so that this function behaves like the
    other collate functions of this module: a file whose name already exists
    in the output is skipped with a warning instead of aborting the whole
    collation with a ``FileExistsError``.

    Parameters:
        - ortholog_annotations: A list of OrthologAnnotationDirFmt objects
            to be merged.
    Returns:
        - The directory format object produced by ``_collate_helper``,
            holding the contents of all inputs.
    """
    return _collate_helper(dir_fmts=ortholog_annotations)


def collate_genomes(
genomes: Union[DNAFASTAFormat, GenomeSequencesDirectoryFormat],
on_duplicates: str = "warn",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>gene1 some_description1
GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT
ATAAACAGTGCTGGAGGC
>gene2 some_description2
CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA
CGGGGGGCCTTGG
>gene3 some_description3
GCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACCCGCCCGCTATAGTGAACACCTAAGAA
CTGGAGG
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>gene11 some_description11
ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAGACCCGCCCGCACCACAGCCCAGAGGGT
ATAAACAGTGCTGGAGGC
>gene12 some_description12
AATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAGA
CGGGGGGCCTTGG
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>k129_5480_1 # 3 # 1988 # -1 # ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442
MPKRTDISSICIIGAGPIVIGQACEFDYSGAQACKALKEEGYRVVLINSNPATIMTDPNM
ADATYIEPITP
>k129_5480_2 # 2150 # 2623 # 1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426
MQKIPLTKQGHTDLEAELKDLKHRQRPAVIAAISEAREHGDLSENAEYHAAREQQSFIEG
RIEQVEAILSLAEIIDPAK
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>k129_5112_1 # 1 # 1218 # -1 # ID=1_1;partial=10;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.669
MTEQTETSQRPVLVVDFGAQYAQLIARRVREAGVYSELVPHTATAEEIAAKDPIGIILSG
GPSSVYEPGAPTLDPAVFDLGVP
>k129_5112_2 # 1261 # 1797 # -1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.650
MPREPKPSSFPAIRGALTFYQVASIITGVMLLLLLAEMILKYSPLHVELFAGGSGGFLWF
APVLVG
110 changes: 83 additions & 27 deletions q2_types/genome_data/tests/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,19 @@


from q2_types.feature_data import DNAFASTAFormat
from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \
partition_orthologs, OrthologAnnotationDirFmt, \
collate_ortholog_annotations, GenomeSequencesDirectoryFormat
from q2_types.genome_data import LociDirectoryFormat
from q2_types.genome_data._methods import collate_loci, collate_genomes


class TestOrthologsPartitionCollating(TestPluginBase):
from q2_types.genome_data import (
SeedOrthologDirFmt, OrthologAnnotationDirFmt,
GenomeSequencesDirectoryFormat, GenesDirectoryFormat,
ProteinsDirectoryFormat, LociDirectoryFormat
)
from q2_types.genome_data._methods import (
collate_loci, collate_genomes, collate_orthologs, partition_orthologs,
collate_genes, collate_proteins, collate_ortholog_annotations
)
from q2_types._util import _collate_helper


class TestPartitionCollating(TestPluginBase):
package = "q2_types.genome_data.tests"

def test_collate_orthologs(self):
Expand All @@ -33,13 +38,78 @@ def test_collate_orthologs(self):
SeedOrthologDirFmt(p2, mode="r")
]

collated_orthologs = collate_orthologs(orthologs)
collated = collate_orthologs(orthologs)
self.assertTrue(os.path.exists(
collated.path / "1.emapper.seed_orthologs")
)
self.assertTrue(os.path.exists(
collated.path / "2.emapper.seed_orthologs")
)
self.assertIsInstance(collated, SeedOrthologDirFmt)

def test_collate_helper(self):
    # Flat layout: each partition directly contains its gene files
    partitions = (
        "partitioned_genes/1/sample1",
        "partitioned_genes/2/sample1",
    )
    dir_fmts = [
        GenesDirectoryFormat(self.get_data_path(rel_path), mode="r")
        for rel_path in partitions
    ]

    collated = _collate_helper(dir_fmts)

    # The files of both partitions end up in the root of the output
    for file_name in ("genes1.fa", "genes2.fa"):
        self.assertTrue(os.path.exists(collated.path / file_name))
    self.assertIsInstance(collated, GenesDirectoryFormat)

def test_collate_helper_sample_data(self):
    # Per sample layout: each partition holds a "sample1" subdirectory
    partitions = ("partitioned_genes/1", "partitioned_genes/2")
    dir_fmts = [
        GenesDirectoryFormat(self.get_data_path(rel_path), mode="r")
        for rel_path in partitions
    ]

    collated = _collate_helper(dir_fmts)

    # The files of both partitions are merged into the same sample directory
    sample_dir = collated.path / "sample1"
    for file_name in ("genes1.fa", "genes2.fa"):
        self.assertTrue(os.path.exists(sample_dir / file_name))
    self.assertIsInstance(collated, GenesDirectoryFormat)

def test_collate_genes(self):
    # Collating two flat gene partitions must yield a GenesDirectoryFormat
    # that contains the files of both partitions.
    p1 = self.get_data_path("partitioned_genes/1/sample1")
    p2 = self.get_data_path("partitioned_genes/2/sample1")
    genes = [
        GenesDirectoryFormat(p1, mode="r"),
        GenesDirectoryFormat(p2, mode="r")
    ]
    collated = collate_genes(genes)
    self.assertTrue(os.path.exists(
        collated.path / "genes1.fa")
    )
    self.assertTrue(os.path.exists(
        collated.path / "genes2.fa")
    )
    self.assertIsInstance(collated, GenesDirectoryFormat)

def test_collate_proteins(self):
    # Collating two protein partitions must yield a ProteinsDirectoryFormat
    # that contains the files of both partitions.
    p1 = self.get_data_path("partitioned_proteins/1")
    p2 = self.get_data_path("partitioned_proteins/2")
    proteins = [
        ProteinsDirectoryFormat(p1, mode="r"),
        ProteinsDirectoryFormat(p2, mode="r")
    ]
    collated = collate_proteins(proteins)
    self.assertTrue(os.path.exists(
        collated.path / "proteins1.faa")
    )
    self.assertTrue(os.path.exists(
        collated.path / "proteins2.faa")
    )
    self.assertIsInstance(collated, ProteinsDirectoryFormat)

def test_collate_loci(self):
p1 = self.get_data_path("uncollated_loci_1")
Expand All @@ -53,20 +123,6 @@ def test_collate_loci(self):
self.assertTrue(all(os.path.exists(
collated_loci.path / f"loci{no}.gff") for no in [1, 2, 3, 4]))

def test_collate_loci_file_exists(self):
    # Collating the same directory twice makes every file of the second
    # input clash with the copy made from the first one.
    p1 = self.get_data_path("uncollated_loci_1")
    loci_list = [
        LociDirectoryFormat(p1, mode="r"),
        LociDirectoryFormat(p1, mode="r")
    ]

    # assertWarnsRegex is independent of the active warning filters and
    # reports a regular test failure (not an IndexError) if nothing is
    # emitted at all.
    with self.assertWarnsRegex(UserWarning, "File already exists"):
        collated_loci = collate_loci(loci_list)

    self.assertTrue(all(os.path.exists(
        collated_loci.path / f"loci{no}.gff") for no in [1, 2]))

def test_partition_orthologs(self):
p = self.get_data_path("collated_orthologs")
orthologs = SeedOrthologDirFmt(path=p, mode="r")
Expand Down Expand Up @@ -159,7 +215,7 @@ def helper_test_collate_genomes_dnafastaformat(self, input):
expected_desc = content[expected_id]["description"]
expected_sequence = content[expected_id]["sequence"]

self.assertEquals(actual_id, expected_id)
self.assertEqual(actual_id, expected_id)
self.assertEqual(actual_description, expected_desc)
self.assertEqual(actual_sequence, expected_sequence)

Expand Down Expand Up @@ -240,7 +296,7 @@ def helper_test_collate_genomes_duplicates_warn(self, dir_fmt):
expected_desc = content[expected_id]["description"]
exp_sequence = content[expected_id]["sequence"]

self.assertEquals(actual_id, expected_id)
self.assertEqual(actual_id, expected_id)
self.assertEqual(actual_description, expected_desc)
self.assertEqual(actual_sequence, exp_sequence)

Expand Down
Loading
Loading