Skip to content

Commit 52a49c8

Browse files
authored
Merge pull request #740 from broadinstitute/jg/determine_end_trunc_filter_from_gerp
Add `get_loftee_end_trunc_filter_expr` and `update_loftee_end_trunc_filter` to patch the LOFTEE END_TRUNC filter
2 parents 99358cf + 9b4c9fe commit 52a49c8

File tree

2 files changed

+358
-0
lines changed

2 files changed

+358
-0
lines changed

gnomad/utils/vep.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,3 +1047,103 @@ def explode_by_vep_annotation(
10471047
t = t.explode_rows(t[vep_annotation])
10481048

10491049
return t
1050+
1051+
1052+
def get_loftee_end_trunc_filter_expr(
1053+
csq_expr: hl.expr.StructExpression,
1054+
gerp_dist_cutoff: float = 0.0,
1055+
) -> hl.expr.BooleanExpression:
1056+
"""
1057+
Get the expression for LOFTEE's END_TRUNC filter based on the GERP distance cutoff.
1058+
1059+
The end truncation filter is based on the GERP distance cutoff (`gerp_dist_cutoff`)
1060+
and the 'GERP_DIST' and '50_BP_RULE' annotations in the LOFTEE annotations.
1061+
1062+
True is returned if the GERP distance is less than the cutoff and the '50_BP_RULE'
1063+
annotation is not 'PASS'.
1064+
1065+
The END_TRUNC filter is designed to identify loss-of-function variants that occur
1066+
near the end of transcripts, which are often less likely to cause true loss of
1067+
function.
1068+
1069+
GERP (Genomic Evolutionary Rate Profiling) scores measure evolutionary constraint,
1070+
with positive values indicating conservation and negative values indicating less
1071+
constraint. The default cutoff of 0.0 means that variants in regions with any level
1072+
of evolutionary constraint (GERP >= 0) will NOT be filtered as END_TRUNC, while
1073+
variants in less constrained regions (GERP < 0) will be filtered.
1074+
1075+
The filter also requires that the '50_BP_RULE' annotation is not 'PASS', meaning
1076+
the variant falls within the last 50 base pairs of the transcript where Nonsense
1077+
Mediated Decay (NMD) escape is more likely.
1078+
1079+
:param csq_expr: StructExpression containing the LOFTEE annotation 'lof_info', with
1080+
'GERP_DIST' and '50_BP_RULE' info.
1081+
:param gerp_dist_cutoff: GERP distance cutoff for end truncation. Default is 0.0.
1082+
:return: BooleanExpression for end truncation annotation.
1083+
"""
1084+
lof_info_expr = hl.dict(
1085+
csq_expr.lof_info.split(",")
1086+
.map(lambda x: x.split(":"))
1087+
.map(lambda x: (x[0], hl.or_missing(x.length() > 1, x[1])))
1088+
)
1089+
1090+
end_trunc_expr = hl.or_else(
1091+
hl.float64(lof_info_expr.get("GERP_DIST", "0")) < gerp_dist_cutoff, False
1092+
) & hl.or_else(lof_info_expr.get("50_BP_RULE", "") != "PASS", False)
1093+
1094+
return end_trunc_expr
1095+
1096+
1097+
def update_loftee_end_trunc_filter(
1098+
csq_expr: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
1099+
gerp_dist_cutoff: float = 0.0,
1100+
) -> hl.expr.StructExpression:
1101+
"""
1102+
Update the LOFTEE end truncation filter in the input Struct or Array of Structs.
1103+
1104+
The LOFTEE end truncation filter is updated based on the GERP distance cutoff
1105+
(`gerp_dist_cutoff`) using `get_loftee_end_trunc_filter_expr`.
1106+
1107+
The 'lof_filter' field in the input Struct or Array of Structs is updated to include
1108+
'END_TRUNC' if the end truncation filter is met, and 'END_TRUNC' is removed if the
1109+
end truncation filter is not met.
1110+
1111+
Then the 'lof' field in the input Struct or Array of Structs is updated to 'HC' if
1112+
the new 'lof_filter' is missing, and 'LC' if it's not missing.
1113+
1114+
:param csq_expr: Struct or Array of Structs containing the LOFTEE annotations.
1115+
:param gerp_dist_cutoff: GERP distance cutoff for end truncation. Default is 0.0.
1116+
:return: Struct or Array of Structs with updated LOFTEE end truncation filter
1117+
annotation.
1118+
"""
1119+
1120+
def _update_csq_struct(csq_expr: hl.expr.StructExpression):
1121+
"""
1122+
Update the LOFTEE end truncation filter in the input Struct.
1123+
1124+
:param csq_expr: Struct containing the LOFTEE annotations.
1125+
:return: Consequence Struct with updated LOFTEE annotations.
1126+
"""
1127+
end_trunc_expr = get_loftee_end_trunc_filter_expr(csq_expr, gerp_dist_cutoff)
1128+
filter_parts = csq_expr.lof_filter.split(",")
1129+
filter_expr = hl.or_else(
1130+
hl.set(filter_parts.filter(lambda x: x != "")), hl.empty_set(hl.tstr)
1131+
)
1132+
filter_expr = hl.if_else(
1133+
end_trunc_expr,
1134+
filter_expr.add("END_TRUNC"),
1135+
filter_expr.remove("END_TRUNC"),
1136+
)
1137+
filter_expr = hl.or_missing(filter_expr.length() > 0, hl.delimit(filter_expr))
1138+
1139+
lof_expr = hl.or_missing(
1140+
hl.is_defined(csq_expr.lof),
1141+
hl.if_else(hl.is_missing(filter_expr), "HC", "LC"),
1142+
)
1143+
1144+
return hl.struct(lof_filter=filter_expr, lof=lof_expr)
1145+
1146+
if isinstance(csq_expr, hl.expr.StructExpression):
1147+
return csq_expr.annotate(**_update_csq_struct(csq_expr))
1148+
else:
1149+
return csq_expr.map(lambda x: x.annotate(**_update_csq_struct(x)))

tests/utils/test_vep.py

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
"""Tests for the VEP utility module."""
2+
3+
import hail as hl
4+
import pytest
5+
6+
from gnomad.utils.vep import (
7+
get_loftee_end_trunc_filter_expr,
8+
update_loftee_end_trunc_filter,
9+
)
10+
11+
12+
class TestGetLofteeEndTruncFilterExpr:
13+
"""Test the get_loftee_end_trunc_filter_expr function."""
14+
15+
@pytest.fixture
16+
def sample_csq_structs(self):
17+
"""Fixture to create sample consequence structs with different LOFTEE annotations."""
18+
return [
19+
# Case 1: GERP_DIST < 0, 50_BP_RULE != PASS -> should be True for default.
20+
hl.Struct(lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL,OTHER:value"),
21+
# Case 2: GERP_DIST >= 0, 50_BP_RULE != PASS -> should be False for default.
22+
hl.Struct(lof_info="GERP_DIST:1.5,50_BP_RULE:FAIL,OTHER:value"),
23+
# Case 3: GERP_DIST < 0, 50_BP_RULE = PASS -> should be False for default.
24+
hl.Struct(lof_info="GERP_DIST:-1.0,50_BP_RULE:PASS,OTHER:value"),
25+
# Case 4: GERP_DIST >= 0, 50_BP_RULE = PASS -> should be False for default.
26+
hl.Struct(lof_info="GERP_DIST:0.5,50_BP_RULE:PASS,OTHER:value"),
27+
# Case 5: GERP_DIST >= 0, 50_BP_RULE != PASS -> should be False for default.
28+
hl.Struct(lof_info="GERP_DIST:0.5,50_BP_RULE:FAIL,OTHER:value"),
29+
# Case 6: Missing GERP_DIST (defaults to 0), 50_BP_RULE != PASS -> should
30+
# be False for default.
31+
hl.Struct(lof_info="50_BP_RULE:FAIL,OTHER:value"),
32+
# Case 7: GERP_DIST < 0, missing 50_BP_RULE (defaults to empty) -> should
33+
# be True for default.
34+
hl.Struct(lof_info="GERP_DIST:-1.5,OTHER:value"),
35+
# Case 8: Empty lof_info -> should be False for default.
36+
hl.Struct(lof_info=""),
37+
]
38+
39+
def test_default_cutoff(self, sample_csq_structs):
40+
"""Test the function with default cutoff of 0.0."""
41+
ht = hl.Table.parallelize(
42+
[{"csq": csq} for csq in sample_csq_structs],
43+
hl.tstruct(csq=hl.tstruct(lof_info=hl.tstr)),
44+
)
45+
46+
# Apply the function.
47+
ht = ht.annotate(end_trunc=get_loftee_end_trunc_filter_expr(ht.csq))
48+
49+
# Collect results
50+
results = ht.collect()
51+
52+
# Expected results for default cutoff (0.0).
53+
expected = [True, False, False, False, False, False, True, False]
54+
55+
assert [r.end_trunc for r in results] == expected
56+
57+
def test_custom_cutoff_positive(self, sample_csq_structs):
58+
"""Test the function with a positive cutoff."""
59+
ht = hl.Table.parallelize(
60+
[{"csq": csq} for csq in sample_csq_structs],
61+
hl.tstruct(csq=hl.tstruct(lof_info=hl.tstr)),
62+
)
63+
64+
# Apply the function with cutoff 1.0.
65+
ht = ht.annotate(
66+
end_trunc=get_loftee_end_trunc_filter_expr(ht.csq, gerp_dist_cutoff=1.0)
67+
)
68+
69+
# Collect results
70+
results = ht.collect()
71+
72+
# Expected results for cutoff 1.0.
73+
expected = [True, False, False, False, True, True, True, True]
74+
75+
assert [r.end_trunc for r in results] == expected
76+
77+
def test_custom_cutoff_negative(self, sample_csq_structs):
78+
"""Test the function with a negative cutoff."""
79+
ht = hl.Table.parallelize(
80+
[{"csq": csq} for csq in sample_csq_structs],
81+
hl.tstruct(csq=hl.tstruct(lof_info=hl.tstr)),
82+
)
83+
84+
# Apply the function with cutoff -1.0.
85+
ht = ht.annotate(
86+
end_trunc=get_loftee_end_trunc_filter_expr(ht.csq, gerp_dist_cutoff=-1.0)
87+
)
88+
89+
# Collect results
90+
results = ht.collect()
91+
92+
# Expected results for cutoff -1.0.
93+
expected = [True, False, False, False, False, False, True, False]
94+
95+
assert [r.end_trunc for r in results] == expected
96+
97+
98+
class TestUpdateLofteeEndTruncFilter:
99+
"""Test the update_loftee_end_trunc_filter function."""
100+
101+
@pytest.fixture
102+
def sample_csq_structs_with_filters(self):
103+
"""Fixture to create sample consequence structs with lof_filter and lof annotations."""
104+
return [
105+
# Case 1: Should add END_TRUNC filter.
106+
hl.Struct(
107+
lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL",
108+
lof_filter="SINGLE_EXON",
109+
lof="HC",
110+
),
111+
# Case 2: Should not add END_TRUNC filter.
112+
hl.Struct(
113+
lof_info="GERP_DIST:1.5,50_BP_RULE:PASS",
114+
lof_filter="SINGLE_EXON",
115+
lof="HC",
116+
),
117+
# Case 3: Should remove existing END_TRUNC filter.
118+
hl.Struct(
119+
lof_info="GERP_DIST:1.0,50_BP_RULE:PASS",
120+
lof_filter="SINGLE_EXON,END_TRUNC",
121+
lof="LC",
122+
),
123+
# Case 4: Should add END_TRUNC.
124+
hl.Struct(
125+
lof_info="GERP_DIST:-1.0,50_BP_RULE:FAIL", lof_filter="", lof="HC"
126+
),
127+
# Case 5: Missing lof_filter.
128+
hl.Struct(
129+
lof_info="GERP_DIST:-1.5,50_BP_RULE:FAIL", lof_filter=None, lof="HC"
130+
),
131+
]
132+
133+
def test_update_single_struct(self, sample_csq_structs_with_filters):
134+
"""Test updating a single consequence struct."""
135+
ht = hl.Table.parallelize(
136+
[{"csq": csq} for csq in sample_csq_structs_with_filters],
137+
hl.tstruct(
138+
csq=hl.tstruct(lof_info=hl.tstr, lof_filter=hl.tstr, lof=hl.tstr)
139+
),
140+
)
141+
142+
# Apply the function.
143+
ht = ht.annotate(updated_csq=update_loftee_end_trunc_filter(ht.csq))
144+
145+
# Collect results.
146+
results = ht.collect()
147+
148+
# Check results.
149+
assert results[0].updated_csq.lof_filter == "END_TRUNC,SINGLE_EXON"
150+
assert results[0].updated_csq.lof == "LC"
151+
152+
# Still LC because filter is not empty.
153+
assert results[1].updated_csq.lof_filter == "SINGLE_EXON"
154+
assert results[1].updated_csq.lof == "LC"
155+
156+
# Still LC because filter is not empty.
157+
assert results[2].updated_csq.lof_filter == "SINGLE_EXON"
158+
assert results[2].updated_csq.lof == "LC"
159+
160+
assert results[3].updated_csq.lof_filter == "END_TRUNC"
161+
assert results[3].updated_csq.lof == "LC"
162+
163+
assert results[4].updated_csq.lof_filter == "END_TRUNC"
164+
assert results[4].updated_csq.lof == "LC"
165+
166+
def test_update_array_of_structs(self, sample_csq_structs_with_filters):
167+
"""Test updating an array of consequence structs."""
168+
# Create a table with arrays of consequences.
169+
ht = hl.Table.parallelize(
170+
[
171+
{"csqs": sample_csq_structs_with_filters[:2]},
172+
{"csqs": sample_csq_structs_with_filters[2:]},
173+
],
174+
hl.tstruct(
175+
csqs=hl.tarray(
176+
hl.tstruct(lof_info=hl.tstr, lof_filter=hl.tstr, lof=hl.tstr)
177+
)
178+
),
179+
)
180+
181+
# Apply the function.
182+
ht = ht.annotate(updated_csqs=update_loftee_end_trunc_filter(ht.csqs))
183+
184+
# Collect results.
185+
results = ht.collect()
186+
187+
# Check first array.
188+
first_array = results[0].updated_csqs
189+
assert first_array[0].lof_filter == "END_TRUNC,SINGLE_EXON"
190+
assert first_array[0].lof == "LC"
191+
assert first_array[1].lof_filter == "SINGLE_EXON"
192+
assert first_array[1].lof == "LC"
193+
194+
# Check second array.
195+
second_array = results[1].updated_csqs
196+
assert second_array[0].lof_filter == "SINGLE_EXON"
197+
assert second_array[0].lof == "LC"
198+
assert second_array[1].lof_filter == "END_TRUNC"
199+
assert second_array[1].lof == "LC"
200+
assert second_array[2].lof_filter == "END_TRUNC"
201+
assert second_array[2].lof == "LC"
202+
203+
def test_missing_lof_annotation(self):
204+
"""Test updating when lof annotation is missing."""
205+
csq_with_missing_lof = hl.Struct(
206+
lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL",
207+
lof_filter="SINGLE_EXON",
208+
lof=None,
209+
)
210+
211+
ht = hl.Table.parallelize(
212+
[{"csq": csq_with_missing_lof}],
213+
hl.tstruct(
214+
csq=hl.tstruct(lof_info=hl.tstr, lof_filter=hl.tstr, lof=hl.tstr)
215+
),
216+
)
217+
218+
# Apply the function.
219+
ht = ht.annotate(updated_csq=update_loftee_end_trunc_filter(ht.csq))
220+
221+
# Collect results.
222+
results = ht.collect()
223+
224+
# This case shouldn't happen. If lof_filter is defined, lof should be defined
225+
# too. However, we should handle it gracefully by adding END_TRUNC, but
226+
# maintaining the lof missingness status.
227+
assert results[0].updated_csq.lof_filter == "END_TRUNC,SINGLE_EXON"
228+
assert results[0].updated_csq.lof is None
229+
230+
def test_empty_filter_handling(self):
231+
"""Test handling of empty and None filters."""
232+
test_cases = [
233+
hl.Struct(
234+
lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL", lof_filter="", lof="HC"
235+
),
236+
hl.Struct(
237+
lof_info="GERP_DIST:-2.5,50_BP_RULE:FAIL", lof_filter=None, lof="HC"
238+
),
239+
]
240+
241+
ht = hl.Table.parallelize(
242+
[{"csq": csq} for csq in test_cases],
243+
hl.tstruct(
244+
csq=hl.tstruct(lof_info=hl.tstr, lof_filter=hl.tstr, lof=hl.tstr)
245+
),
246+
)
247+
248+
# Apply the function.
249+
ht = ht.annotate(updated_csq=update_loftee_end_trunc_filter(ht.csq))
250+
251+
# Collect results.
252+
results = ht.collect()
253+
254+
assert results[0].updated_csq.lof_filter == "END_TRUNC"
255+
assert results[0].updated_csq.lof == "LC"
256+
257+
assert results[1].updated_csq.lof_filter == "END_TRUNC"
258+
assert results[1].updated_csq.lof == "LC"

0 commit comments

Comments
 (0)