Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
docker run --rm -v `pwd`:/project -w /project quay.io/pypa/manylinux2014_x86_64 bash .github/workflows/docker/buildwheel.sh
- name: Check vcztools CLI
run: |
pip install numpy "zarr>=2.17,<3" "click>=8.2.0" "pyranges!=0.1.3" pyparsing
pip install numpy "zarr>=2.17,<3" "click>=8.2.0" pandas pyparsing ruranges
pip install vcztools --no-index --only-binary vcztools -f dist/wheelhouse
vcztools --help
# Make sure we don't have ``vcztools`` in the CWD
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ dependencies = [
"numpy>=1.23.5",
"zarr>=2.17,<3",
"click>=8.2.0",
"pyranges!=0.1.3",
"pyparsing>=3"
"pandas",
"pyparsing>=3",
"ruranges"
]
requires-python = ">=3.10"
classifiers = [
Expand Down
117 changes: 62 additions & 55 deletions vcztools/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from typing import Any

import numpy as np
import pandas as pd
from pyranges import PyRanges
from ruranges import complement_overlaps, overlaps


def parse_region_string(region: str) -> tuple[str, int | None, int | None]:
Expand All @@ -20,12 +19,41 @@ def parse_region_string(region: str) -> tuple[str, int | None, int | None]:
return contig, None, None


def regions_to_pyranges(
class GenomicRanges:
def __init__(self, contigs, starts, ends):
# note that ruranges groups must be unsigned
self.contigs = np.ascontiguousarray(contigs, dtype=np.uint64)
self.starts = np.ascontiguousarray(starts)
self.ends = np.ascontiguousarray(ends)

def overlaps(self, other: "GenomicRanges"):
overlap = overlaps(
groups=self.contigs,
starts=self.starts,
ends=self.ends,
groups2=other.contigs,
starts2=other.starts,
ends2=other.ends,
)
return overlap[0] # indices of overlapping regions with self

def complement_overlaps(self, other: "GenomicRanges"):
return complement_overlaps(
groups=self.contigs,
starts=self.starts,
ends=self.ends,
groups2=other.contigs,
starts2=other.starts,
ends2=other.ends,
)


def regions_to_ranges(
regions: list[tuple[str, int | None, int | None]], all_contigs: list[str]
) -> PyRanges:
"""Convert region tuples to a PyRanges object."""
) -> GenomicRanges:
"""Convert region tuples to a GenomicRanges object."""

chromosomes = []
contigs = []
starts = []
ends = []
for contig, start, end in regions:
Expand All @@ -37,33 +65,33 @@ def regions_to_pyranges(
if end is None:
end = np.iinfo(np.int64).max

chromosomes.append(all_contigs.index(contig))
contigs.append(all_contigs.index(contig))
starts.append(start)
ends.append(end)

return PyRanges(chromosomes=chromosomes, starts=starts, ends=ends)
return GenomicRanges(np.array(contigs), np.array(starts), np.array(ends))


def parse_regions(
regions: list[str] | str | None, all_contigs: list[str]
) -> PyRanges | None:
"""Return a PyRanges object from a comma-separated set of region strings,
) -> GenomicRanges | None:
"""Return a GenomicRanges object from a comma-separated set of region strings,
or a list of region strings."""
if regions is None:
return None
elif isinstance(regions, list):
regions_list = regions
else:
regions_list = regions.split(",")
return regions_to_pyranges(
return regions_to_ranges(
[parse_region_string(region) for region in regions_list], all_contigs
)


def parse_targets(
targets: list[str] | str | None, all_contigs: list[str]
) -> tuple[PyRanges | None, bool]:
"""Return a PyRanges object from a comma-separated set of region strings,
) -> tuple[GenomicRanges | None, bool]:
"""Return a GenomicRanges object from a comma-separated set of region strings,
optionally preceeded by a ^ character to indicate complement,
or a list of region strings."""
if targets is None:
Expand All @@ -81,8 +109,8 @@ def parse_targets(


def regions_to_chunk_indexes(
regions: PyRanges | None,
targets: PyRanges | None,
regions: GenomicRanges | None,
targets: GenomicRanges | None,
complement: bool,
regions_index: Any,
):
Expand All @@ -95,39 +123,33 @@ def regions_to_chunk_indexes(
taking into account the complement flag.
"""

# Create pyranges for chunks using the region index.
# Create GenomicRanges for chunks using the region index.
# For regions use max end position, for targets just end position
chunk_index = regions_index[:, 0]
contig_id = regions_index[:, 1]
start_position = regions_index[:, 2]
end_position = regions_index[:, 3]
max_end_position = regions_index[:, 4]
df = pd.DataFrame(
{
"chunk_index": chunk_index,
"Chromosome": contig_id,
"Start": start_position,
"End": max_end_position if regions is not None else end_position,
}
chunk_regions = GenomicRanges(
contig_id,
start_position,
max_end_position if regions is not None else end_position,
)
chunk_regions = PyRanges(df)

if regions is not None:
overlap = chunk_regions.overlap(regions)
overlap = chunk_regions.overlaps(regions)
elif complement:
overlap = chunk_regions.subtract(targets)
overlap = chunk_regions.complement_overlaps(targets)
else:
overlap = chunk_regions.overlap(targets)
if overlap.empty:
return np.empty((0,), dtype=np.int64)
chunk_indexes = overlap.df["chunk_index"].to_numpy()
overlap = chunk_regions.overlaps(targets)
chunk_indexes = chunk_index[overlap]
chunk_indexes = np.unique(chunk_indexes)
return chunk_indexes


def regions_to_selection(
regions: PyRanges | None,
targets: PyRanges | None,
regions: GenomicRanges | None,
targets: GenomicRanges | None,
complement: bool,
variant_contig: Any,
variant_position: Any,
Expand All @@ -145,50 +167,35 @@ def regions_to_selection(

if regions is not None:
variant_end = variant_start + variant_length
df = pd.DataFrame(
{"Chromosome": variant_contig, "Start": variant_start, "End": variant_end}
)
# save original index as column so we can retrieve it after finding overlap
df["index"] = df.index
variant_regions = PyRanges(df)
variant_regions = GenomicRanges(variant_contig, variant_start, variant_end)
else:
variant_regions = None

if targets is not None:
targets_variant_end = variant_position # length 1
df = pd.DataFrame(
{
"Chromosome": variant_contig,
"Start": variant_start,
"End": targets_variant_end,
}
variant_targets = GenomicRanges(
variant_contig, variant_start, targets_variant_end
)
# save original index as column so we can retrieve it after finding overlap
df["index"] = df.index
variant_targets = PyRanges(df)
else:
variant_targets = None

if variant_regions is not None:
regions_overlap = variant_regions.overlap(regions)
regions_overlap = variant_regions.overlaps(regions)
else:
regions_overlap = None

if variant_targets is not None:
if complement:
targets_overlap = variant_targets.subtract(targets)
targets_overlap = variant_targets.complement_overlaps(targets)
else:
targets_overlap = variant_targets.overlap(targets)
targets_overlap = variant_targets.overlaps(targets)
else:
targets_overlap = None

if regions_overlap is not None and targets_overlap is not None:
overlap = regions_overlap.overlap(targets_overlap)
overlap = np.intersect1d(regions_overlap, targets_overlap)
elif regions_overlap is not None:
overlap = regions_overlap
else:
overlap = targets_overlap

if overlap.empty:
return np.empty((0,), dtype=np.int64)
return overlap.df["index"].to_numpy()
return overlap
12 changes: 6 additions & 6 deletions vcztools/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,15 @@ def variant_chunk_index_iter(root, regions=None, targets=None):

else:
contigs_u = root["contig_id"][:].astype("U").tolist()
regions_pyranges = parse_regions(regions, contigs_u)
targets_pyranges, complement = parse_targets(targets, contigs_u)
regions_ranges = parse_regions(regions, contigs_u)
targets_ranges, complement = parse_targets(targets, contigs_u)

# Use the region index to find the chunks that overlap specfied regions or
# targets
region_index = root["region_index"][:]
chunk_indexes = regions_to_chunk_indexes(
regions_pyranges,
targets_pyranges,
regions_ranges,
targets_ranges,
complement,
region_index,
)
Expand All @@ -117,8 +117,8 @@ def variant_chunk_index_iter(root, regions=None, targets=None):

# Find the final variant selection
variant_selection = regions_to_selection(
regions_pyranges,
targets_pyranges,
regions_ranges,
targets_ranges,
complement,
region_variant_contig,
region_variant_position,
Expand Down
Loading