Allow single-element and wildcard array subscripts in filter expressions.

Adds the Integer and IndexAny parse nodes, implements IndexedIdentifier for
`[N]` and `[*]` subscripts (other subscript forms still raise
UnsupportedArraySubscriptError), and adds unit and bcftools-validation tests.

diff --git a/tests/test_bcftools_validation.py b/tests/test_bcftools_validation.py
index e62a089..e84158c 100644
--- a/tests/test_bcftools_validation.py
+++ b/tests/test_bcftools_validation.py
@@ -117,7 +117,8 @@ def run_vcztools(args: str, expect_error=False) -> tuple[str, str]:
             "view --no-version -i 'FILTER~\"VQSRTrancheINDEL99.00to100.00\"'",
             "1kg_2020_chrM.vcf.gz"
         ),
-        ("view --no-version -i 'INFO/AC>2'", "chr22.vcf.gz")
+        ("view --no-version -i 'INFO/AC>2'", "chr22.vcf.gz"),
+        ("view --no-version -i 'INFO/AC[0]>2'", "chr22.vcf.gz")
     ],
     # This is necessary when trying to run individual tests, as the arguments above
     # make for unworkable command lines
diff --git a/tests/test_filter.py b/tests/test_filter.py
index 83be0b6..f90fce8 100644
--- a/tests/test_filter.py
+++ b/tests/test_filter.py
@@ -38,20 +38,13 @@ def test_invalid_expressions(self, parser, expression):
             ('DP="."', filter_mod.UnsupportedMissingDataError),
             ("ID!=@~/file", filter_mod.UnsupportedFileReferenceError),
             ("INFO/TAG=@file", filter_mod.UnsupportedFileReferenceError),
-            ("INFO/X[0] == 1", filter_mod.UnsupportedArraySubscriptError),
-            ("INFO/AF[0] > 0.3", filter_mod.UnsupportedArraySubscriptError),
             ("FORMAT/AD[0:0] > 30", filter_mod.UnsupportedArraySubscriptError),
-            ("DP4[*] == 0", filter_mod.UnsupportedArraySubscriptError),
             ("FORMAT/DP[1-3] > 10", filter_mod.UnsupportedArraySubscriptError),
             ("FORMAT/DP[1-] < 7", filter_mod.UnsupportedArraySubscriptError),
             ("FORMAT/DP[0,2-4] > 20", filter_mod.UnsupportedArraySubscriptError),
             ("FORMAT/AD[0:*]", filter_mod.UnsupportedArraySubscriptError),
             ("FORMAT/AD[0:]", filter_mod.UnsupportedArraySubscriptError),
             ("FORMAT/AD[*:1]", filter_mod.UnsupportedArraySubscriptError),
-            (
-                "(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3",
-                filter_mod.UnsupportedArraySubscriptError,
-            ),
             ("binom(FMT/AD)", filter_mod.UnsupportedFunctionsError),
             ("fisher(INFO/DP4)", filter_mod.UnsupportedFunctionsError),
             ("fisher(FMT/ADF,FMT/ADR)", filter_mod.UnsupportedFunctionsError),
@@ -261,6 +254,25 @@ def test_evaluate_type_operation(self, expression, expected):
         result = fee.evaluate(numpify_values(data))
         nt.assert_array_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        ("expression", "expected"),
+        [
+            ("INFO/AC>=2", [[0, 0], [1, 1], [0, 1], [1, 0]]),
+            ("INFO/AC[*]>=2", [[0, 0], [1, 1], [0, 1], [1, 0]]),
+            ("INFO/AC[0]>=2", [0, 1, 0, 1]),
+            ("INFO/AC[1]>=2", [0, 1, 1, 0]),
+        ],
+    )
+    def test_evaluate_array_subscripts(self, expression, expected):
+        data = {
+            "variant_AC": [[1, -1], [5, 4], [1, 4], [2, -1]],
+        }
+        fee = filter_mod.FilterExpression(
+            field_names={"variant_AC"}, include=expression
+        )
+        result = fee.evaluate(numpify_values(data))
+        nt.assert_array_equal(result, expected)
+
     @pytest.mark.parametrize(
         ("expr", "expected"),
         [
diff --git a/vcztools/filter.py b/vcztools/filter.py
index b5f0baa..b519a5e 100644
--- a/vcztools/filter.py
+++ b/vcztools/filter.py
@@ -99,6 +99,11 @@ class Number(Constant):
     pass
 
 
+class Integer(Constant):
+    def eval(self, data):
+        return int(self.tokens)
+
+
 class String(Constant):
     def __init__(self, tokens):
         super().__init__(tokens)
@@ -146,11 +151,29 @@ def referenced_fields(self):
         return frozenset([self.field_name])
 
 
+class IndexAny(Constant):
+    def eval(self, data):
+        return Ellipsis
+
+
 class IndexedIdentifier(EvaluationNode):
     def __init__(self, tokens):
-        # The tokens here are the already resolved idenfitier
-        # and the index
-        raise UnsupportedArraySubscriptError()
+        token = tokens[0]
+        # The tokens here are the already resolved identifier
+        # and the index - but only in the case of a single element
+        # index (an int), or any (*)
+        if len(token) > 2:
+            raise UnsupportedArraySubscriptError()
+        self.identifier = token[0]
+        self.index = token[1]
+
+    def eval(self, data):
+        val = self.identifier.eval(data)
+        ind = self.index.eval(data)
+        return val[:, ind]  # index samples dim
+
+    def referenced_fields(self):
+        return self.identifier.referenced_fields()
 
 
 class RegexOperator(EvaluationNode):
@@ -523,9 +546,11 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
     # TODO we need to define the indexing grammar more carefully, but
     # this at least let's us match correct strings and raise an informative
     # error
+    index_single_element_expr = pp.Word(pp.nums).set_parse_action(Integer)
+    index_any_element_expr = pp.Literal("*").set_parse_action(IndexAny)
     index_expr = pp.OneOrMore(
-        pp.common.number
-        | pp.Literal("*")
+        index_single_element_expr
+        | index_any_element_expr
         | pp.Literal(":")
         | pp.Literal("-")
         | pp.Literal(",")