diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 9e696a0..9e83d23 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -36,7 +36,7 @@ jobs: docker run --rm -v `pwd`:/project -w /project quay.io/pypa/manylinux2014_x86_64 bash .github/workflows/docker/buildwheel.sh - name: Check vcztools CLI run: | - pip install numpy "zarr>=2.17,<3" "click>=8.2.0" "pyranges!=0.1.3" pyparsing + pip install numpy "zarr>=2.17,<3" "click>=8.2.0" pandas pyparsing ruranges pip install vcztools --no-index --only-binary vcztools -f dist/wheelhouse vcztools --help # Make sure we don't have ``vcztools`` in the CWD diff --git a/pyproject.toml b/pyproject.toml index 37cc3f7..703adca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,9 @@ dependencies = [ "numpy>=1.23.5", "zarr>=2.17,<3", "click>=8.2.0", - "pyranges!=0.1.3", - "pyparsing>=3" + "pandas", + "pyparsing>=3", + "ruranges" ] requires-python = ">=3.10" classifiers = [ diff --git a/vcztools/regions.py b/vcztools/regions.py index 6e542d3..1dfaf9e 100644 --- a/vcztools/regions.py +++ b/vcztools/regions.py @@ -2,8 +2,7 @@ from typing import Any import numpy as np -import pandas as pd -from pyranges import PyRanges +from ruranges import complement_overlaps, overlaps def parse_region_string(region: str) -> tuple[str, int | None, int | None]: @@ -20,12 +19,41 @@ def parse_region_string(region: str) -> tuple[str, int | None, int | None]: return contig, None, None -def regions_to_pyranges( +class GenomicRanges: + def __init__(self, contigs, starts, ends): + # note that ruranges groups must be unsigned + self.contigs = np.ascontiguousarray(contigs, dtype=np.uint64) + self.starts = np.ascontiguousarray(starts) + self.ends = np.ascontiguousarray(ends) + + def overlaps(self, other: "GenomicRanges"): + overlap = overlaps( + groups=self.contigs, + starts=self.starts, + ends=self.ends, + groups2=other.contigs, + starts2=other.starts, + ends2=other.ends, + ) + return overlap[0] # indices of overlapping regions with self + + def complement_overlaps(self, other: "GenomicRanges"): + return complement_overlaps( + groups=self.contigs, + starts=self.starts, + ends=self.ends, + groups2=other.contigs, + starts2=other.starts, + ends2=other.ends, + ) + + +def regions_to_ranges( regions: list[tuple[str, int | None, int | None]], all_contigs: list[str] -) -> PyRanges: - """Convert region tuples to a PyRanges object.""" +) -> GenomicRanges: + """Convert region tuples to a GenomicRanges object.""" - chromosomes = [] + contigs = [] starts = [] ends = [] for contig, start, end in regions: @@ -37,17 +65,17 @@ def regions_to_pyranges( if end is None: end = np.iinfo(np.int64).max - chromosomes.append(all_contigs.index(contig)) + contigs.append(all_contigs.index(contig)) starts.append(start) ends.append(end) - return PyRanges(chromosomes=chromosomes, starts=starts, ends=ends) + return GenomicRanges(np.array(contigs), np.array(starts), np.array(ends)) def parse_regions( regions: list[str] | str | None, all_contigs: list[str] -) -> PyRanges | None: - """Return a PyRanges object from a comma-separated set of region strings, +) -> GenomicRanges | None: + """Return a GenomicRanges object from a comma-separated set of region strings, or a list of region strings.""" if regions is None: return None @@ -55,15 +83,15 @@ def parse_regions( regions_list = regions else: regions_list = regions.split(",") - return regions_to_pyranges( + return regions_to_ranges( [parse_region_string(region) for region in regions_list], all_contigs ) def parse_targets( targets: list[str] | str | None, all_contigs: list[str] -) -> tuple[PyRanges | None, bool]: - """Return a PyRanges object from a comma-separated set of region strings, +) -> tuple[GenomicRanges | None, bool]: + """Return a GenomicRanges object from a comma-separated set of region strings, optionally preceeded by a ^ character to indicate complement, or a list of region strings.""" if targets is None: @@ -81,8 +109,8 @@ def parse_targets( def regions_to_chunk_indexes( - regions: PyRanges | None, - targets: PyRanges | None, + regions: GenomicRanges | None, + targets: GenomicRanges | None, complement: bool, regions_index: Any, ): @@ -95,39 +123,33 @@ def regions_to_chunk_indexes( taking into account the complement flag. """ - # Create pyranges for chunks using the region index. + # Create GenomicRanges for chunks using the region index. # For regions use max end position, for targets just end position chunk_index = regions_index[:, 0] contig_id = regions_index[:, 1] start_position = regions_index[:, 2] end_position = regions_index[:, 3] max_end_position = regions_index[:, 4] - df = pd.DataFrame( - { - "chunk_index": chunk_index, - "Chromosome": contig_id, - "Start": start_position, - "End": max_end_position if regions is not None else end_position, - } + chunk_regions = GenomicRanges( + contig_id, + start_position, + max_end_position if regions is not None else end_position, ) - chunk_regions = PyRanges(df) if regions is not None: - overlap = chunk_regions.overlap(regions) + overlap = chunk_regions.overlaps(regions) elif complement: - overlap = chunk_regions.subtract(targets) + overlap = chunk_regions.complement_overlaps(targets) else: - overlap = chunk_regions.overlap(targets) - if overlap.empty: - return np.empty((0,), dtype=np.int64) - chunk_indexes = overlap.df["chunk_index"].to_numpy() + overlap = chunk_regions.overlaps(targets) + chunk_indexes = chunk_index[overlap] chunk_indexes = np.unique(chunk_indexes) return chunk_indexes def regions_to_selection( - regions: PyRanges | None, - targets: PyRanges | None, + regions: GenomicRanges | None, + targets: GenomicRanges | None, complement: bool, variant_contig: Any, variant_position: Any, @@ -145,50 +167,35 @@ def regions_to_selection( if regions is not None: variant_end = variant_start + variant_length - df = pd.DataFrame( - {"Chromosome": variant_contig, "Start": variant_start, "End": variant_end} - ) - # save original index as column so we can retrieve it after finding overlap - df["index"] = df.index - variant_regions = PyRanges(df) + variant_regions = GenomicRanges(variant_contig, variant_start, variant_end) else: variant_regions = None if targets is not None: targets_variant_end = variant_position # length 1 - df = pd.DataFrame( - { - "Chromosome": variant_contig, - "Start": variant_start, - "End": targets_variant_end, - } + variant_targets = GenomicRanges( + variant_contig, variant_start, targets_variant_end ) - # save original index as column so we can retrieve it after finding overlap - df["index"] = df.index - variant_targets = PyRanges(df) else: variant_targets = None if variant_regions is not None: - regions_overlap = variant_regions.overlap(regions) + regions_overlap = variant_regions.overlaps(regions) else: regions_overlap = None if variant_targets is not None: if complement: - targets_overlap = variant_targets.subtract(targets) + targets_overlap = variant_targets.complement_overlaps(targets) else: - targets_overlap = variant_targets.overlap(targets) + targets_overlap = variant_targets.overlaps(targets) else: targets_overlap = None if regions_overlap is not None and targets_overlap is not None: - overlap = regions_overlap.overlap(targets_overlap) + overlap = np.intersect1d(regions_overlap, targets_overlap) elif regions_overlap is not None: overlap = regions_overlap else: overlap = targets_overlap - - if overlap.empty: - return np.empty((0,), dtype=np.int64) - return overlap.df["index"].to_numpy() + return overlap diff --git a/vcztools/retrieval.py b/vcztools/retrieval.py index 4fde829..0f98bda 100644 --- a/vcztools/retrieval.py +++ b/vcztools/retrieval.py @@ -87,15 +87,15 @@ def variant_chunk_index_iter(root, regions=None, targets=None): else: contigs_u = root["contig_id"][:].astype("U").tolist() - regions_pyranges = parse_regions(regions, contigs_u) - targets_pyranges, complement = parse_targets(targets, contigs_u) + regions_ranges = parse_regions(regions, contigs_u) + targets_ranges, complement = parse_targets(targets, contigs_u) # Use the region index to find the chunks that overlap specfied regions or # targets region_index = root["region_index"][:] chunk_indexes = regions_to_chunk_indexes( - regions_pyranges, - targets_pyranges, + regions_ranges, + targets_ranges, complement, region_index, ) @@ -117,8 +117,8 @@ def variant_chunk_index_iter(root, regions=None, targets=None): # Find the final variant selection variant_selection = regions_to_selection( - regions_pyranges, - targets_pyranges, + regions_ranges, + targets_ranges, complement, region_variant_contig, region_variant_position,