
Commit 48a1190

add regression report generator
ghstack-source-id: 7a68457
Pull-Request: #7094
1 parent 0e9d37e commit 48a1190

4 files changed: +306 -14 lines
aws/lambda/benchmark_regression_summary_report/common/regression_utils.py

Lines changed: 296 additions & 0 deletions

@@ -0,0 +1,296 @@
import logging
import math
import statistics
from collections import Counter
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict

from common.benchmark_time_series_api_model import BenchmarkTimeSeriesApiData
from common.config_model import BenchmarkConfig, RegressionPolicy
from dateutil.parser import isoparse


logger = logging.getLogger()

RegressionClassifyLabel = Literal[
    "regression", "suspicious", "no_regression", "insufficient_data"
]


class BaselineItem(TypedDict):
    group_info: Dict[str, Any]
    value: float


class BenchmarkValueItem(TypedDict):
    group_info: Dict[str, Any]
    values: List[Dict[str, Any]]


class PerGroupResult(TypedDict, total=True):
    group_info: Dict[str, Any]
    baseline: Optional[float]
    points: List[Any]
    label: RegressionClassifyLabel
    policy: Optional["RegressionPolicy"]


def percentile(values: list[float], q: float) -> float:
    """Linear-interpolation percentile, with q in [0, 1]."""
    v = sorted(values)
    k = (len(v) - 1) * q
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return v[int(k)]
    return v[f] + (v[c] - v[f]) * (k - f)


class BenchmarkRegressionReportGenerator:
    def __init__(
        self,
        config: BenchmarkConfig,
        latest_ts: BenchmarkTimeSeriesApiData,
        baseline_ts: BenchmarkTimeSeriesApiData,
    ) -> None:
        self.metric_policies = config.policy.metrics
        self.latest_ts = self._to_data_map(latest_ts)
        self.baseline_raw = self._to_data_map(baseline_ts)

    def generate(self) -> Tuple[List[PerGroupResult], Dict[str, Any]]:
        return self.detect_regressions_with_policies(
            self.baseline_raw,
            self.latest_ts,
            metric_policies=self.metric_policies,
        )

    def detect_regressions_with_policies(
        self,
        baseline_map: Dict[tuple, BenchmarkValueItem],
        dp_map: Dict[tuple, BenchmarkValueItem],
        *,
        metric_policies: Dict[str, RegressionPolicy],
        min_points: int = 2,
    ) -> Tuple[List[PerGroupResult], Dict[str, Any]]:
        """
        For each entry in dp_map:
        - choose a policy based on the target metric from group_info["metric"]
          (e.g. passrate, geomean)
        - compute the baseline value according to policy.baseline_aggregation
          (e.g. mean, p90, max, min, latest, earliest, p50, p95)
        - flag each point as a violation or not via policy.is_violation(value, baseline)
        - classify the flag sequence with self.classify_flags(flags, min_points)

        Returns a list of PerGroupResult dicts
        {group_info, baseline, points, label, policy} plus a summary dict.
        """
        logger.info("Generating regression results ...")
        results: List[PerGroupResult] = []

        for key in sorted(dp_map.keys()):
            cur_item = dp_map.get(key)
            gi = cur_item["group_info"] if cur_item else {}
            points: List[Any] = cur_item["values"] if cur_item else []

            base_item = baseline_map.get(key)
            if not base_item:
                logger.warning("Skip. No baseline item found for %s", gi)
                results.append(
                    PerGroupResult(
                        group_info=gi,
                        baseline=None,
                        points=[],
                        label="insufficient_data",
                        policy=None,
                    )
                )
                continue
            policy = self._resolve_policy(metric_policies, gi.get("metric", ""))
            if not policy:
                logger.warning("No policy for %s", gi)
                results.append(
                    PerGroupResult(
                        group_info=gi,
                        baseline=None,
                        points=[],
                        label="insufficient_data",
                        policy=None,
                    )
                )
                continue

            baseline_aggre_mode = policy.baseline_aggregation
            baseline_value = self._get_baseline(base_item, baseline_aggre_mode)
            if baseline_value is None or len(points) == 0:
                logger.warning(
                    "baseline_value is %s, len(points) == %s",
                    baseline_value,
                    len(points),
                )
                results.append(
                    PerGroupResult(
                        group_info=gi,
                        baseline=None,
                        points=[],
                        label="insufficient_data",
                        policy=policy,
                    )
                )
                continue

            # Per-point violations (True = regression)
            flags: List[bool] = [
                policy.is_violation(p["value"], baseline_value["value"]) for p in points
            ]
            label = self.classify_flags(flags, min_points=min_points)

            enriched_points = [{**p, "flag": f} for p, f in zip(points, flags)]
            results.append(
                PerGroupResult(
                    group_info=gi,
                    baseline=baseline_value["value"],
                    points=enriched_points,
                    label=label,
                    policy=policy,
                )
            )

        logger.info("Done. Generated %s regression results", len(results))
        summary = self.summarize_label_counts(results)
        return results, summary

    def summarize_label_counts(self, results: list[PerGroupResult]) -> Dict[str, Any]:
        counts = Counter(self._label_str(r["label"]) for r in results)
        total_count = len(results)
        return {
            "total_count": total_count,
            "regression_count": counts.get("regression", 0),
            "suspicious_count": counts.get("suspicious", 0),
            "no_regression_count": counts.get("no_regression", 0),
            "insufficient_data_count": counts.get("insufficient_data", 0),
            "is_regression": int(counts.get("regression", 0) > 0),
        }

    def _label_str(self, x) -> str:
        # Robust: works for str or Enum-like labels
        if isinstance(x, str):
            return x.lower()
        if hasattr(x, "value"):
            v = x.value
            return (v if isinstance(v, str) else str(v)).lower()
        return str(x).lower()

    def _to_data_map(
        self, data: "BenchmarkTimeSeriesApiData", field: str = "value"
    ) -> Dict[tuple, BenchmarkValueItem]:
        result: Dict[tuple, BenchmarkValueItem] = {}
        for ts_group in data.time_series:
            group_keys = tuple(sorted(ts_group.group_info.items()))
            points: List[Dict[str, Any]] = []
            for d in sorted(
                ts_group.data, key=lambda d: isoparse(d["granularity_bucket"])
            ):
                if field not in d:
                    continue
                points.append(
                    {
                        "value": float(d[field]),
                        "commit": d.get("commit"),
                        "branch": d.get("branch"),
                        "timestamp": isoparse(d["granularity_bucket"]),
                    }
                )
            result[group_keys] = {
                "group_info": ts_group.group_info,
                "values": points,
            }
        return result

    def _get_baseline(
        self,
        data: BenchmarkValueItem,
        mode: str = "mean",
        field: str = "value",
    ) -> Optional[BaselineItem]:
        """
        Calculate the baseline value for the given aggregation mode.
        mode: mean, p90, max, min, latest, earliest, p50, p95
        """
        values = [float(d[field]) for d in data["values"] if field in d]
        if not values:
            return None

        if mode == "mean":
            val = statistics.fmean(values)
        elif mode == "p90":
            val = percentile(values, 0.9)
        elif mode == "max":
            val = max(values)
        elif mode == "min":
            val = min(values)
        elif mode == "latest":
            val = values[-1]
        elif mode == "earliest":
            val = values[0]
        elif mode == "p50":
            val = percentile(values, 0.5)
        elif mode == "p95":
            val = percentile(values, 0.95)
        else:
            logger.warning("Unknown mode: %s", mode)
            return None
        result: BaselineItem = {
            "group_info": data["group_info"],
            "value": val,
        }
        return result

    def classify_flags(
        self, flags: list[bool], min_points: int = 3
    ) -> RegressionClassifyLabel:
        """
        Classify a sequence of boolean flags to detect a regression.

        - regression: the sequence ends with >= 2 consecutive True values
        - suspicious: there is a run of >= 3 consecutive True values, but not at the end
        - no_regression: all other cases
        - insufficient_data: not enough data points (< min_points)

        Special case:
        - If min_points == 1, just look at the last flag:
          True -> regression
          False -> no_regression
        """
        n = len(flags)
        if n == 0:
            return "insufficient_data"

        if min_points == 1:
            return "regression" if flags[-1] else "no_regression"

        if n < min_points:
            return "insufficient_data"

        # trailing run length
        t = 0
        for v in reversed(flags):
            if v:
                t += 1
            else:
                break
        if t >= 2:
            return "regression"

        # longest run anywhere
        longest = cur = 0
        for v in flags:
            cur = cur + 1 if v else 0
            longest = max(longest, cur)

        if longest >= 3:
            return "suspicious"

        return "no_regression"

    def _resolve_policy(
        self,
        metric_policies: Dict[str, RegressionPolicy],
        metric: str,
    ) -> Optional[RegressionPolicy]:
        if not metric:
            return None
        m = metric.lower()
        return metric_policies.get(m)
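
A quick, hedged demo of the semantics above. classify_flags and _get_baseline never read self, so they can be exercised unbound with None; the module path common.regression_utils is taken from the lambda import below, and all values are made-up sample data:

from common.regression_utils import BenchmarkRegressionReportGenerator, percentile

classify = BenchmarkRegressionReportGenerator.classify_flags

print(classify(None, [False, True, True]))        # "regression": trailing run of 2 Trues
print(classify(None, [True, True, True, False]))  # "suspicious": run of 3, but not at the end
print(classify(None, [False, True, False]))       # "no_regression": no qualifying run
print(classify(None, [True], min_points=1))       # "regression": min_points == 1 checks only the last flag

# percentile interpolates linearly: q=0.5 over [1, 2, 3, 4] falls halfway between 2 and 3.
print(percentile([1.0, 2.0, 3.0, 4.0], 0.5))      # 2.5

# _get_baseline aggregates the "value" fields per the mode; p90 of [10, 11, 12, 30]
# interpolates between 12 and 30 at k = 2.7, giving 24.6.
item = {"group_info": {"metric": "geomean"},
        "values": [{"value": v} for v in [10.0, 12.0, 11.0, 30.0]]}
print(BenchmarkRegressionReportGenerator._get_baseline(None, item, mode="p90"))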

aws/lambda/benchmark_regression_summary_report/lambda_function.py

Lines changed: 1 addition & 11 deletions
@@ -1,30 +1,20 @@
 #!/usr/bin/env python
 import argparse
 import datetime as dt
-<<<<<<< HEAD
 import json
-=======
->>>>>>> 52a5ee66f (fix bug2)
 import logging
 import os
 import threading
 import time
 from concurrent.futures import as_completed, ThreadPoolExecutor
-<<<<<<< HEAD
-import time
-=======
->>>>>>> 52a5ee66f (fix bug2)
 from typing import Any, Optional

 import clickhouse_connect
 import requests
 from common.benchmark_time_series_api_model import BenchmarkTimeSeriesApiResponse
 from common.config import get_benchmark_regression_config
 from common.config_model import BenchmarkApiSource, BenchmarkConfig, Frequency
-<<<<<<< HEAD
 from common.regression_utils import BenchmarkRegressionReportGenerator
-=======
->>>>>>> 52a5ee66f (fix bug2)
 from dateutil.parser import isoparse
@@ -54,7 +44,7 @@ def get_clickhouse_client(
     host: str, user: str, password: str
 ) -> clickhouse_connect.driver.client.Client:
     # for local testing only, disable SSL verification
-    # return clickhouse_connect.get_client( host=host, user=user, password=password, secure=True, verify=False)
+    return clickhouse_connect.get_client( host=host, user=user, password=password, secure=True, verify=False)

     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
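
Note that the hunk above makes the verify=False client the unconditional first return, so the secure call below it becomes unreachable. A minimal sketch of keeping the local-testing path opt-in instead, assuming a hypothetical CH_INSECURE environment flag (not part of this commit):

import os

import clickhouse_connect


def get_clickhouse_client(host: str, user: str, password: str):
    # Hypothetical CH_INSECURE flag: TLS verification stays on unless the
    # caller explicitly opts out for local testing.
    insecure = os.getenv("CH_INSECURE") == "1"
    return clickhouse_connect.get_client(
        host=host, user=user, password=password, secure=True, verify=not insecure
    )
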
aws/lambda/benchmark_regression_summary_report/requirements.txt

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
clickhouse_connect==0.8.5
boto3==1.35.33
PyGithub==1.59.0
python-dateutil==2.8.2
PyYAML==6.0.1
Jinja2==3.1.2

clickhouse_db_schema/benchmark_regression_summary_report/schema.sql

Lines changed: 3 additions & 3 deletions
@@ -1,4 +1,4 @@
-CREATE TABLE benchmark.benchmark_regression_report
+CREATE TABLE fortesting.benchmark_regression_report
 (
     `id` UUID DEFAULT generateUUIDv4(),
     `report_id` String, -- unique id for the report config
@@ -15,7 +15,7 @@ CREATE TABLE benchmark.benchmark_regression_report
     `report` String DEFAULT '{}'
 )
 ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
-PARTITION BY toYYYYMM(report_date)
+PARTITION BY toYYYYMM(created_at)
 ORDER BY
 (
     report_id,
@@ -24,7 +24,7 @@ ORDER BY
     last_record_ts,
     last_record_commit,
     created_at,
-    repo
+    repo,
     id
 )
 TTL created_at + toIntervalYear(10)
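
A hedged sketch of reading the newest summary row per report_id back out of this table, using clickhouse_connect as the lambda already does; host/user/password are placeholders, and only columns visible in the hunks above are referenced:

import clickhouse_connect

# Placeholder connection details; the database/table name follows the hunk above.
client = clickhouse_connect.get_client(host="localhost", user="default", password="")

rows = client.query(
    """
    SELECT report_id, max(created_at) AS latest_created
    FROM fortesting.benchmark_regression_report
    GROUP BY report_id
    ORDER BY latest_created DESC
    """
).result_rows

for report_id, latest_created in rows:
    print(report_id, latest_created)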
