
Commit efacd29

[Benchmark Regression] Add Report Level (#7138)
Add a configuration to decide what level of data we should store in the DB for the benchmark regression report, from lowest to highest severity.
1 parent 23b5b6b commit efacd29

4 files changed (+115, −59 lines)

aws/lambda/benchmark_regression_summary_report/common/config.py

Lines changed: 4 additions & 1 deletion
@@ -7,6 +7,7 @@
     Policy,
     RangeConfig,
     RegressionPolicy,
+    ReportConfig,
 )


@@ -37,7 +38,6 @@
         "startTime": "{{ startTime }}",
         "stopTime": "{{ stopTime }}",
         "suites": ["torchbench", "huggingface", "timm_models"],
-        "workflowId": 0,
         "branches": ["main"]
     }
 }
@@ -79,6 +79,9 @@
             "issue": "7081",
         },
     ),
+    report_config=ReportConfig(
+        report_level="no_regression",
+    ),
 )

 BENCHMARK_REGRESSION_CONFIG = BenchmarkRegressionConfigBook(
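
For context: report_level defaults to "regression", so only regression-labeled results are persisted, while this config opts into "no_regression", which keeps everything except empty or unknown results. A minimal sketch of the difference, assuming ReportConfig as defined in common/config_model.py below and the lambda directory on the import path:

# Sketch only: assumes aws/lambda/benchmark_regression_summary_report is importable.
from common.config_model import ReportConfig

default_cfg = ReportConfig()  # report_level defaults to "regression"
verbose_cfg = ReportConfig(report_level="no_regression")

print(default_cfg.get_order())  # 4 -> only results labeled "regression" are stored
print(verbose_cfg.get_order())  # 1 -> everything except "none"/"unknown" results is stored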

aws/lambda/benchmark_regression_summary_report/common/config_model.py

Lines changed: 62 additions & 0 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import dataclasses
 import json
 from dataclasses import dataclass, field
 from datetime import timedelta
@@ -223,6 +224,56 @@ def get_github_notification_config(self) -> Optional[GitHubNotificationConfig]:
         return GitHubNotificationConfig.from_dict(self.notification_config)


+ReportSeverity = Literal[
+    "none",
+    "no_regression",
+    "insufficient_data",
+    "suspicious",
+    "regression",
+    "unknown",
+]
+
+
+# Mapping from severity label → numeric level.
+# Higher numbers mean more severe; used to compare and filter results before DB upload.
+#
+# Effects on DB upload:
+# - "unknown": (-1) fallback for invalid input → excluded from the DB
+# - "none": (0) no report generated → the results field is empty, nothing is uploaded
+# - "no_regression": (1) clean report → all results with severity >= 1 are uploaded
+# - "insufficient_data": (2) weak signal → uploaded, along with suspicious/regression
+# - "suspicious": (3) medium signal → uploaded, along with regression
+# - "regression": (4) strongest severity → always uploaded
+#
+# This allows filtering: e.g., if threshold = 2, the DB upload includes
+# "insufficient_data", "suspicious", and "regression" but not "none" or "no_regression".
+SEVERITY_ORDER: dict[ReportSeverity, int] = {
+    "unknown": -1,  # fallback (bad/invalid input)
+    "none": 0,  # no report generated; no details uploaded to the DB results field
+    "no_regression": 1,  # no regression; include all results with the same or higher severity
+    "insufficient_data": 2,  # weak signal
+    "suspicious": 3,  # medium signal
+    "regression": 4,  # strongest severity
+}
+
+
+@dataclass
+class ReportConfig:
+    """
+    Decides what to include in the DB summary report.
+    report_level: the lowest severity level to store in the DB.
+    """
+
+    report_level: ReportSeverity = "regression"
+
+    def get_severity_map(self) -> dict[ReportSeverity, int]:
+        # A shallow copy is safe since this is a flat dict.
+        return SEVERITY_ORDER.copy()
+
+    def get_order(self) -> int:
+        return self.get_severity_map().get(self.report_level, -1)
+
+
 # -------- Top-level benchmark regression config --------
 @dataclass
 class BenchmarkConfig:
@@ -242,6 +293,7 @@ class BenchmarkConfig:
     source: BenchmarkApiSource
     policy: Policy
     hud_info: Optional[dict[str, Any]] = None
+    report_config: ReportConfig = field(default_factory=ReportConfig)


 @dataclass
@@ -253,3 +305,13 @@ def __getitem__(self, key: str) -> BenchmarkConfig:
         if not config:
             raise KeyError(f"Config {key} not found")
         return config
+
+
+def to_dict(x: Any) -> Any:
+    if dataclasses.is_dataclass(x):
+        return {f.name: to_dict(getattr(x, f.name)) for f in dataclasses.fields(x)}
+    if isinstance(x, dict):
+        return {k: to_dict(v) for k, v in x.items()}
+    if isinstance(x, (list, tuple, set)):
+        return [to_dict(v) for v in x]
+    return x  # primitive or already JSON-serializable
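
A small usage sketch of the new helpers: to_dict recursively serializes dataclasses, and ReportConfig maps its label to a numeric threshold. ReportConfig, SEVERITY_ORDER, and to_dict are as defined above; the Example dataclass is hypothetical and only for illustration.

import dataclasses
import json

from common.config_model import ReportConfig, SEVERITY_ORDER, to_dict


@dataclasses.dataclass
class Example:  # hypothetical container, not part of the repo
    name: str
    report_config: ReportConfig


cfg = Example(name="demo", report_config=ReportConfig(report_level="suspicious"))

# to_dict recurses through dataclasses/dicts/sequences, so the result is JSON-ready.
print(json.dumps(to_dict(cfg), indent=2))
# -> {"name": "demo", "report_config": {"report_level": "suspicious"}}

# The numeric order drives filtering: "suspicious" maps to 3.
assert SEVERITY_ORDER["suspicious"] == cfg.report_config.get_order()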

aws/lambda/benchmark_regression_summary_report/common/report_manager.py

Lines changed: 46 additions & 55 deletions
@@ -1,17 +1,12 @@
-import dataclasses
 import datetime as dt
 import json
 import logging
 import uuid
 from typing import Any, Dict

 import clickhouse_connect
-from common.config_model import BenchmarkConfig, Frequency
-from common.regression_utils import (
-    BenchmarkRegressionReport,
-    get_regression_status,
-    PerGroupResult,
-)
+from common.config_model import BenchmarkConfig, ReportConfig, to_dict
+from common.regression_utils import BenchmarkRegressionReport, get_regression_status
 from jinja2 import Template


@@ -71,9 +66,6 @@
 """


-logger = logging.getLogger()
-
-
 class ReportManager:
     """
     handles db insertion and notification processing
@@ -91,27 +83,28 @@ def __init__(
     ):
         self.is_dry_run = is_dry_run

-        self.report = regression_report
+        self.raw_report = regression_report
+
         self.config_id = config.id
         self.config = config
-
         self.type = type
         self.repo = repo
         self.db_table_name = db_table_name

         self.id = str(uuid.uuid4())

         # extract latest meta data from report
-        self.baseline = self.report["baseline_meta_data"]
-        self.target = self.report["new_meta_data"]
+        self.baseline = self.raw_report["baseline_meta_data"]
+        self.target = self.raw_report["new_meta_data"]
         self.target_latest_commit = self.target["end"]["commit"]
         self.target_latest_ts_str = self.target["end"]["timestamp"]
-        self.status = get_regression_status(self.report["summary"])
+        self.status = get_regression_status(self.raw_report["summary"])

         self.report_data = self._to_report_data(
             config_id=config.id,
-            regression_report=self.report,
-            frequency=self.config.policy.frequency,
+            regression_report=self.raw_report,
+            config=self.config,
+            status=self.status,
         )

     def run(
@@ -146,7 +139,7 @@ def notify_github_comment(self, github_token: str):
             )
             return
         logger.info("[%s] prepareing gitub comment content", self.config_id)
-        content = self._to_markdoown()
+        content = self._to_markdown()
         if self.is_dry_run:
             logger.info(
                 "[%s]dry run, skip sending comment to github, report(%s)",
@@ -161,31 +154,24 @@ def notify_github_comment(self, github_token: str):
         github_notification.create_github_comment(content, github_token)
         logger.info("[%s] done. comment is sent to github", self.config_id)

-    def _to_markdoown(self):
-        self.regression_items = self._collect_regression_items()
-        url = ""
-        if self.config.hud_info:
-            url = self.config.hud_info.get("url", "")
-
-        md = Template(REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True).render(
+    def _to_markdown(self) -> str:
+        regression_items = [
+            r for r in self.raw_report["results"] if r.get("label") == "regression"
+        ]
+        url = (self.config.hud_info or {}).get("url", "")
+        return Template(
+            REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True
+        ).render(
             id=self.id,
             url=url,
             status=self.status,
             report_id=self.config_id,
-            summary=self.report["summary"],
+            summary=self.raw_report["summary"],
             baseline=self.baseline,
             target=self.target,
             frequency=self.config.policy.frequency.get_text(),
-            regression_items=self.regression_items,
+            regression_items=regression_items,
         )
-        return md
-
-    def _collect_regression_items(self) -> list[PerGroupResult]:
-        items = []
-        for item in self.report["results"]:
-            if item["label"] == "regression":
-                items.append(item)
-        return items

     def insert_to_db(
         self,
@@ -205,7 +191,7 @@ def insert_to_db(

         try:
             report_json = json.dumps(
-                self.report, ensure_ascii=False, separators=(",", ":"), default=str
+                self.report_data, ensure_ascii=False, separators=(",", ":"), default=str
             )
         except Exception:
             logger.exception(
@@ -214,12 +200,12 @@ def insert_to_db(
             )
             raise

-        regression_summary = self.report["summary"]
+        regression_summary = self.raw_report["summary"]
         params = {
             "id": str(self.id),
             "report_id": self.config_id,
             "type": self.type,
-            "status": get_regression_status(self.report["summary"]),
+            "status": get_regression_status(self.raw_report["summary"]),
             "last_record_commit": self.target_latest_commit,
             "last_record_ts": last_record_ts,
             "regression_count": regression_summary["regression_count"],
@@ -238,9 +224,10 @@ def insert_to_db(
             self.config_id,
             self.id,
         )
-        logger.info("[dry run] printing db params data")
         if self.is_dry_run:
-            print(json.dumps(params, indent=2, default=str))
+            logger.info("[dry run] printing db params data")
+            print({k: v for k, v in params.items() if k != "report_json"})
+            print(json.dumps(self.report_data, indent=2, default=str))
             logger.info("[dry run] Done! Finish printing db params data")
             return False
         logger.info(
@@ -382,28 +369,32 @@ def _validate_latest_meta_info(
     def _to_report_data(
         self,
         config_id: str,
+        config: BenchmarkConfig,
         regression_report: BenchmarkRegressionReport,
-        frequency: Frequency,
+        status: str,
     ) -> dict[str, Any]:
         if not self.target_latest_commit:
             raise ValueError(
                 f"missing commit from new is required, latest is {self.target}"
             )
-        lastest_ts_str = self.target_latest_ts_str
-        if not lastest_ts_str:
-            raise ValueError(f"timestamp from new is required, latest is {self.target}")

-        def to_dict(x):  # handle dataclass or dict/object
-            if dataclasses.is_dataclass(x):
-                return dataclasses.asdict(x)
-            if isinstance(x, dict):
-                return x
-            return vars(x) if hasattr(x, "__dict__") else {"value": str(x)}
-
-        report = to_dict(regression_report)
+        filtered = self._filter_report(regression_report, config.report_config)
+        logger.info("policy: %s", to_dict(self.config.policy))
         return {
-            "status": self.status,
+            "status": status,
             "report_id": config_id,
-            "report": report,
-            "frequency": frequency.get_text(),
+            "policy": to_dict(config.policy),
+            "report": to_dict(filtered),
         }
+
+    def _filter_report(
+        self, report: BenchmarkRegressionReport, report_config: ReportConfig
+    ) -> dict[str, Any]:
+        new_report = {k: v for k, v in report.items() if k != "results"}
+        level_order = report_config.get_severity_map()
+        threshold = report_config.get_order()
+
+        new_report["results"] = [
+            r for r in report["results"] if level_order[r["label"]] >= threshold
+        ]
+        return new_report
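
To illustrate the filtering that _filter_report applies before the report is written to the DB, a standalone sketch using a toy report dict (it only models the "results" and "summary" keys the filter touches, not the full BenchmarkRegressionReport shape):

from common.config_model import ReportConfig

# Toy report, for illustration only; labels match ReportSeverity values.
report = {
    "summary": {"regression_count": 1},
    "results": [
        {"label": "no_regression", "group": "a"},
        {"label": "insufficient_data", "group": "b"},
        {"label": "suspicious", "group": "c"},
        {"label": "regression", "group": "d"},
    ],
}

cfg = ReportConfig(report_level="suspicious")  # threshold = 3
order = cfg.get_severity_map()

# Same logic as _filter_report: keep only results at or above the threshold.
filtered = {k: v for k, v in report.items() if k != "results"}
filtered["results"] = [r for r in report["results"] if order[r["label"]] >= cfg.get_order()]

print([r["group"] for r in filtered["results"]])  # ['c', 'd'] -> suspicious and regression only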

aws/lambda/benchmark_regression_summary_report/lambda_function.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 import argparse
 import datetime as dt
-import json
 import logging
 import os
 import threading
@@ -155,8 +154,9 @@ def process(
             config=config, target_ts=target, baseline_ts=baseline
         )
         regression_report = generator.generate()
-        if self.is_dry_run:
-            print(json.dumps(regression_report, indent=2, default=str))
+        # debugging only
+        # if self.is_dry_run:
+        #     print(json.dumps(regression_report, indent=2, default=str))
         reportManager = ReportManager(
             config=config,
             regression_report=regression_report,
