Merge pull request #740 from broadinstitute/jg/determine_end_trunc_filter_from_gerp

jkgoodrich · web-flow · commit 52a49c861502 · 2025-09-30T16:38:09.000-04:00
Add `get_loftee_end_trunc_filter_expr` and `update_loftee_end_trunc_filter` to patch the LOFTEE END_TRUNC filter
diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py
@@ -1047,3 +1047,103 @@ def explode_by_vep_annotation(
         t = t.explode_rows(t[vep_annotation])
 
     return t
+
+
+def get_loftee_end_trunc_filter_expr(
+    csq_expr: hl.expr.StructExpression,
+    gerp_dist_cutoff: float = 0.0,
+) -> hl.expr.BooleanExpression:
+    """
+    Get the expression for LOFTEE's END_TRUNC filter based on the GERP distance cutoff.
+
+    The end truncation filter is based on the GERP distance cutoff (`gerp_dist_cutoff`)
+    and the 'GERP_DIST' and '50_BP_RULE' annotations in the LOFTEE annotations.
+
+    True is returned if the GERP distance is less than the cutoff and the '50_BP_RULE'
+    annotation is not 'PASS'.
+
+    The END_TRUNC filter is designed to identify loss-of-function variants that occur
+    near the end of transcripts, which are often less likely to cause true loss of
+    function.
+
+    GERP (Genomic Evolutionary Rate Profiling) scores measure evolutionary constraint,
+    with positive values indicating conservation and negative values indicating less
+    constraint. The default cutoff of 0.0 means that variants in regions with any level
+    of evolutionary constraint (GERP >= 0) will NOT be filtered as END_TRUNC, while
+    variants in less constrained regions (GERP < 0) will be filtered.
+
+    The filter also requires that the '50_BP_RULE' annotation is not 'PASS', meaning
+    the variant falls within the last 50 base pairs of the transcript where Nonsense
+    Mediated Decay (NMD) escape is more likely.
+
+    :param csq_expr: StructExpression containing the LOFTEE annotation 'lof_info', with
+        'GERP_DIST' and '50_BP_RULE' info.
+    :param gerp_dist_cutoff: GERP distance cutoff for end truncation. Default is 0.0.
+    :return: BooleanExpression for end truncation annotation.
+    """
+    lof_info_expr = hl.dict(
+        csq_expr.lof_info.split(",")
+        .map(lambda x: x.split(":"))
+        .map(lambda x: (x[0], hl.or_missing(x.length() > 1, x[1])))
+    )
+
+    end_trunc_expr = hl.or_else(
+        hl.float64(lof_info_expr.get("GERP_DIST", "0")) < gerp_dist_cutoff, False
+    ) & hl.or_else(lof_info_expr.get("50_BP_RULE", "") != "PASS", False)
+
+    return end_trunc_expr
+
+
+def update_loftee_end_trunc_filter(
+    csq_expr: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
+    gerp_dist_cutoff: float = 0.0,
+) -> hl.expr.StructExpression:
+    """
+    Update the LOFTEE end truncation filter in the input Struct or Array of Structs.
+
+    The LOFTEE end truncation filter is updated based on the GERP distance cutoff
+    (`gerp_dist_cutoff`) using `get_loftee_end_trunc_filter_expr`.
+
+    The 'lof_filter' field in the input Struct or Array of Structs is updated to include
+    'END_TRUNC' if the end truncation filter is met, and 'END_TRUNC' is removed if the
+    end truncation filter is not met.
+
+    Then the 'lof' field in the input Struct or Array of Structs is updated to 'HC' if
+    the new 'lof_filter' is missing, and 'LC' if it's not missing.
+
+    :param csq_expr: Struct or Array of Structs containing the LOFTEE annotations.
+    :param gerp_dist_cutoff: GERP distance cutoff for end truncation. Default is 0.0.
+    :return: Struct or Array of Structs with updated LOFTEE end truncation filter
+        annotation.
+    """
+
+    def _update_csq_struct(csq_expr: hl.expr.StructExpression):
+        """
+        Update the LOFTEE end truncation filter in the input Struct.
+
+        :param csq_expr: Struct containing the LOFTEE annotations.
+        :return: Consequence Struct with updated LOFTEE annotations.
+        """
+        end_trunc_expr = get_loftee_end_trunc_filter_expr(csq_expr, gerp_dist_cutoff)
+        filter_parts = csq_expr.lof_filter.split(",")
+        filter_expr = hl.or_else(
+            hl.set(filter_parts.filter(lambda x: x != "")), hl.empty_set(hl.tstr)
+        )
+        filter_expr = hl.if_else(
+            end_trunc_expr,
+            filter_expr.add("END_TRUNC"),
+            filter_expr.remove("END_TRUNC"),
+        )
+        filter_expr = hl.or_missing(filter_expr.length() > 0, hl.delimit(filter_expr))
+
+        lof_expr = hl.or_missing(
+            hl.is_defined(csq_expr.lof),
+            hl.if_else(hl.is_missing(filter_expr), "HC", "LC"),
+        )
+
+        return hl.struct(lof_filter=filter_expr, lof=lof_expr)
+
+    if isinstance(csq_expr, hl.expr.StructExpression):
+        return csq_expr.annotate(**_update_csq_struct(csq_expr))
+    else:
+        return csq_expr.map(lambda x: x.annotate(**_update_csq_struct(x)))
diff --git a/tests/utils/test_vep.py b/tests/utils/test_vep.py
@@ -0,0 +1,258 @@
+"""Tests for the VEP utility module."""
+
+import hail as hl
+import pytest
+
+from gnomad.utils.vep import (
+    get_loftee_end_trunc_filter_expr,
+    update_loftee_end_trunc_filter,
+)
+
+
+class TestGetLofteeEndTruncFilterExpr:
+    """Test the get_loftee_end_trunc_filter_expr function."""
+
+    @pytest.fixture
+    def sample_csq_structs(self):
+        """Fixture to create sample consequence structs with different LOFTEE annotations."""
+        return [
+            # Case 1: GERP_DIST < 0, 50_BP_RULE != PASS -> should be True for default.
+            hl.Struct(lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL,OTHER:value"),
+            # Case 2: GERP_DIST >= 0, 50_BP_RULE != PASS -> should be False for default.
+            hl.Struct(lof_info="GERP_DIST:1.5,50_BP_RULE:FAIL,OTHER:value"),
+            # Case 3: GERP_DIST < 0, 50_BP_RULE = PASS -> should be False for default.
+            hl.Struct(lof_info="GERP_DIST:-1.0,50_BP_RULE:PASS,OTHER:value"),
+            # Case 4: GERP_DIST >= 0, 50_BP_RULE = PASS -> should be False for default.
+            hl.Struct(lof_info="GERP_DIST:0.5,50_BP_RULE:PASS,OTHER:value"),
+            # Case 5: GERP_DIST >= 0, 50_BP_RULE != PASS -> should be False for default.
+            hl.Struct(lof_info="GERP_DIST:0.5,50_BP_RULE:FAIL,OTHER:value"),
+            # Case 6: Missing GERP_DIST (defaults to 0), 50_BP_RULE != PASS -> should
+            # be False for default.
+            hl.Struct(lof_info="50_BP_RULE:FAIL,OTHER:value"),
+            # Case 7: GERP_DIST < 0, missing 50_BP_RULE (defaults to empty) -> should
+            # be True for default.
+            hl.Struct(lof_info="GERP_DIST:-1.5,OTHER:value"),
+            # Case 8: Empty lof_info -> should be False for default.
+            hl.Struct(lof_info=""),
+        ]
+
+    def test_default_cutoff(self, sample_csq_structs):
+        """Test the function with default cutoff of 0.0."""
+        ht = hl.Table.parallelize(
+            [{"csq": csq} for csq in sample_csq_structs],
+            hl.tstruct(csq=hl.tstruct(lof_info=hl.tstr)),
+        )
+
+        # Apply the function.
+        ht = ht.annotate(end_trunc=get_loftee_end_trunc_filter_expr(ht.csq))
+
+        # Collect results
+        results = ht.collect()
+
+        # Expected results for default cutoff (0.0).
+        expected = [True, False, False, False, False, False, True, False]
+
+        assert [r.end_trunc for r in results] == expected
+
+    def test_custom_cutoff_positive(self, sample_csq_structs):
+        """Test the function with a positive cutoff."""
+        ht = hl.Table.parallelize(
+            [{"csq": csq} for csq in sample_csq_structs],
+            hl.tstruct(csq=hl.tstruct(lof_info=hl.tstr)),
+        )
+
+        # Apply the function with cutoff 1.0.
+        ht = ht.annotate(
+            end_trunc=get_loftee_end_trunc_filter_expr(ht.csq, gerp_dist_cutoff=1.0)
+        )
+
+        # Collect results
+        results = ht.collect()
+
+        # Expected results for cutoff 1.0.
+        expected = [True, False, False, False, True, True, True, True]
+
+        assert [r.end_trunc for r in results] == expected
+
+    def test_custom_cutoff_negative(self, sample_csq_structs):
+        """Test the function with a negative cutoff."""
+        ht = hl.Table.parallelize(
+            [{"csq": csq} for csq in sample_csq_structs],
+            hl.tstruct(csq=hl.tstruct(lof_info=hl.tstr)),
+        )
+
+        # Apply the function with cutoff -1.0.
+        ht = ht.annotate(
+            end_trunc=get_loftee_end_trunc_filter_expr(ht.csq, gerp_dist_cutoff=-1.0)
+        )
+
+        # Collect results
+        results = ht.collect()
+
+        # Expected results for cutoff -1.0.
+        expected = [True, False, False, False, False, False, True, False]
+
+        assert [r.end_trunc for r in results] == expected
+
+
+class TestUpdateLofteeEndTruncFilter:
+    """Test the update_loftee_end_trunc_filter function."""
+
+    @pytest.fixture
+    def sample_csq_structs_with_filters(self):
+        """Fixture to create sample consequence structs with lof_filter and lof annotations."""
+        return [
+            # Case 1: Should add END_TRUNC filter.
+            hl.Struct(
+                lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL",
+                lof_filter="SINGLE_EXON",
+                lof="HC",
+            ),
+            # Case 2: Should not add END_TRUNC filter.
+            hl.Struct(
+                lof_info="GERP_DIST:1.5,50_BP_RULE:PASS",
+                lof_filter="SINGLE_EXON",
+                lof="HC",
+            ),
+            # Case 3: Should remove existing END_TRUNC filter.
+            hl.Struct(
+                lof_info="GERP_DIST:1.0,50_BP_RULE:PASS",
+                lof_filter="SINGLE_EXON,END_TRUNC",
+                lof="LC",
+            ),
+            # Case 4: Should add END_TRUNC.
+            hl.Struct(
+                lof_info="GERP_DIST:-1.0,50_BP_RULE:FAIL", lof_filter="", lof="HC"
+            ),
+            # Case 5: Missing lof_filter.
+            hl.Struct(
+                lof_info="GERP_DIST:-1.5,50_BP_RULE:FAIL", lof_filter=None, lof="HC"
+            ),
+        ]
+
+    def test_update_single_struct(self, sample_csq_structs_with_filters):
+        """Test updating a single consequence struct."""
+        ht = hl.Table.parallelize(
+            [{"csq": csq} for csq in sample_csq_structs_with_filters],
+            hl.tstruct(
+                csq=hl.tstruct(lof_info=hl.tstr, lof_filter=hl.tstr, lof=hl.tstr)
+            ),
+        )
+
+        # Apply the function.
+        ht = ht.annotate(updated_csq=update_loftee_end_trunc_filter(ht.csq))
+
+        # Collect results.
+        results = ht.collect()
+
+        # Check results.
+        assert results[0].updated_csq.lof_filter == "END_TRUNC,SINGLE_EXON"
+        assert results[0].updated_csq.lof == "LC"
+
+        # Still LC because filter is not empty.
+        assert results[1].updated_csq.lof_filter == "SINGLE_EXON"
+        assert results[1].updated_csq.lof == "LC"
+
+        # Still LC because filter is not empty.
+        assert results[2].updated_csq.lof_filter == "SINGLE_EXON"
+        assert results[2].updated_csq.lof == "LC"
+
+        assert results[3].updated_csq.lof_filter == "END_TRUNC"
+        assert results[3].updated_csq.lof == "LC"
+
+        assert results[4].updated_csq.lof_filter == "END_TRUNC"
+        assert results[4].updated_csq.lof == "LC"
+
+    def test_update_array_of_structs(self, sample_csq_structs_with_filters):
+        """Test updating an array of consequence structs."""
+        # Create a table with arrays of consequences.
+        ht = hl.Table.parallelize(
+            [
+                {"csqs": sample_csq_structs_with_filters[:2]},
+                {"csqs": sample_csq_structs_with_filters[2:]},
+            ],
+            hl.tstruct(
+                csqs=hl.tarray(
+                    hl.tstruct(lof_info=hl.tstr, lof_filter=hl.tstr, lof=hl.tstr)
+                )
+            ),
+        )
+
+        # Apply the function.
+        ht = ht.annotate(updated_csqs=update_loftee_end_trunc_filter(ht.csqs))
+
+        # Collect results.
+        results = ht.collect()
+
+        # Check first array.
+        first_array = results[0].updated_csqs
+        assert first_array[0].lof_filter == "END_TRUNC,SINGLE_EXON"
+        assert first_array[0].lof == "LC"
+        assert first_array[1].lof_filter == "SINGLE_EXON"
+        assert first_array[1].lof == "LC"
+
+        # Check second array.
+        second_array = results[1].updated_csqs
+        assert second_array[0].lof_filter == "SINGLE_EXON"
+        assert second_array[0].lof == "LC"
+        assert second_array[1].lof_filter == "END_TRUNC"
+        assert second_array[1].lof == "LC"
+        assert second_array[2].lof_filter == "END_TRUNC"
+        assert second_array[2].lof == "LC"
+
+    def test_missing_lof_annotation(self):
+        """Test updating when lof annotation is missing."""
+        csq_with_missing_lof = hl.Struct(
+            lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL",
+            lof_filter="SINGLE_EXON",
+            lof=None,
+        )
+
+        ht = hl.Table.parallelize(
+            [{"csq": csq_with_missing_lof}],
+            hl.tstruct(
+                csq=hl.tstruct(lof_info=hl.tstr, lof_filter=hl.tstr, lof=hl.tstr)
+            ),
+        )
+
+        # Apply the function.
+        ht = ht.annotate(updated_csq=update_loftee_end_trunc_filter(ht.csq))
+
+        # Collect results.
+        results = ht.collect()
+
+        # This case shouldn't happen. If lof_filter is defined, lof should be defined
+        # too. However, we should handle it gracefully by adding END_TRUNC, but
+        # maintaining the lof missingness status.
+        assert results[0].updated_csq.lof_filter == "END_TRUNC,SINGLE_EXON"
+        assert results[0].updated_csq.lof is None
+
+    def test_empty_filter_handling(self):
+        """Test handling of empty and None filters."""
+        test_cases = [
+            hl.Struct(
+                lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL", lof_filter="", lof="HC"
+            ),
+            hl.Struct(
+                lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL", lof_filter=None, lof="HC"
+            ),
+        ]
+
+        ht = hl.Table.parallelize(
+            [{"csq": csq} for csq in test_cases],
+            hl.tstruct(
+                csq=hl.tstruct(lof_info=hl.tstr, lof_filter=hl.tstr, lof=hl.tstr)
+            ),
+        )
+
+        # Apply the function.
+        ht = ht.annotate(updated_csq=update_loftee_end_trunc_filter(ht.csq))
+
+        # Collect results.
+        results = ht.collect()
+
+        assert results[0].updated_csq.lof_filter == "END_TRUNC"
+        assert results[0].updated_csq.lof == "LC"
+
+        assert results[1].updated_csq.lof_filter == "END_TRUNC"
+        assert results[1].updated_csq.lof == "LC"