add github notification

yangw-dev · yangw-dev · commit 49c8c2210bf2 · 2025-09-08T20:31:22.000-07:00
ghstack-source-id: ff6f673 Pull-Request: #7096
diff --git a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py
@@ -1,6 +1,5 @@
-import datetime as dt
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
 
 import requests
 
@@ -63,24 +62,3 @@ def from_request(
         except Exception as e:
             raise RuntimeError(f"Malformed API payload: {e}")
         return cls(data=BenchmarkTimeSeriesApiData(time_series=ts, time_range=tr))
-
-
-def get_latest_meta_info(
-    time_series: List[BenchmarkTimeSeriesItem],
-) -> Optional[dict[str, Any]]:
-    if not time_series:
-        return None
-
-    pts = [p for s in time_series for p in s.data]
-    latest = max(
-        pts,
-        key=lambda p: dt.datetime.fromisoformat(
-            p["granularity_bucket"].replace("Z", "+00:00")
-        ),
-    )
-    return {
-        "commit": latest.get("commit", ""),
-        "branch": latest.get("branch", ""),
-        "timestamp": latest.get("granularity_bucket", ""),
-        "workflow_id": latest.get("workflow_id", ""),
-    }
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -15,7 +15,6 @@
 # their own benchmark regression config, currenlty place
 # here for lambda
 
-
 COMPILER_BENCHMARK_CONFIG = BenchmarkConfig(
     name="Compiler Benchmark Regression",
     id="compiler_regression",
@@ -44,6 +43,9 @@
                 }
                 """,
     ),
+    hud_info={
+        "url": "https://hud.pytorch.org/benchmark/compilers",
+    },
     # set baseline from past 7 days using avg, and compare with the last 1 day
     policy=Policy(
         frequency=Frequency(value=1, unit="days"),
@@ -67,7 +69,7 @@
             "compression_ratio": RegressionPolicy(
                 name="compression_ratio",
                 condition="greater_equal",
-                threshold=0.9,
+                threshold=0.95,
                 baseline_aggregation="max",
             ),
         },
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
@@ -241,6 +241,7 @@ class BenchmarkConfig:
     id: str
     source: BenchmarkApiSource
     policy: Policy
+    hud_info: Optional[dict[str, Any]] = None
 
 
 @dataclass
diff --git a/aws/lambda/benchmark_regression_summary_report/common/report_manager.py b/aws/lambda/benchmark_regression_summary_report/common/report_manager.py
@@ -12,9 +12,63 @@
     get_regression_status,
     PerGroupResult,
 )
+from jinja2 import Template
 
 
 logger = logging.getLogger()
+REPORT_MD_TEMPLATE = """# Benchmark Report {{ id }}
+config_id: `{{ report_id }}`
+
+We have detected **{{ status }}** in benchmark results for `{{ report_id }}` (id: `{{ id }}`).
+(HUD benchmark regression page coming soon...)
+
+> **Status:** {{ status }} · **Frequency:** {{ frequency }}
+
+## Summary
+| Metric | Value |
+| :-- | --: |
+| Total | {{ summary.total_count | default(0) }} |
+| Regressions | {{ summary.regression_count | default(0) }} |
+| Suspicious | {{ summary.suspicious_count | default(0) }} |
+| No Regression | {{ summary.no_regression_count | default(0) }} |
+| Insufficient Data | {{ summary.insufficient_data_count | default(0) }} |
+
+## Data Windows
+Baseline is a single reference value (e.g., mean, max, min, latest) aggregated from the previous few days,
+used to detect regressions by comparing against metric values in the target window.
+
+### Baseline window (used to calculate baseline value)
+- **Start:** `{{ baseline.start.timestamp | default('') }}` (commit: `{{ baseline.start.commit | default('') }}`)
+- **End:** `{{ baseline.end.timestamp   | default('') }}` (commit: `{{ baseline.end.commit   | default('') }}`)
+
+### Target window (used to compare against baseline value)
+- **Start:** `{{ target.start.timestamp | default('') }}` (commit: `{{ target.start.commit | default('') }}`)
+- **End:** `{{ target.end.timestamp   | default('') }}` (commit: `{{ target.end.commit   | default('') }}`)
+
+{% if regression_items and regression_items|length > 0 %}
+## Regression Glance
+{% if url %}
+Use items below in [HUD]({{ url }}) to see regression.
+{% endif %}
+
+{% set items = regression_items if regression_items|length <= 10 else regression_items[:10] %}
+{% if regression_items|length > 10 %}
+… (showing first 10 only, total {{ regression_items|length }} regressions)
+{% endif %}
+{% for item in items %}
+{% set kv = item.group_info|dictsort %}
+{{ "" }}|{% for k, _ in kv %}{{ k }} |{% endfor %}{{ "\n" -}}
+|{% for _k, _ in kv %}---|{% endfor %}{{ "\n" -}}
+|{% for _k, v in kv %}{{ v }} |{% endfor %}{{ "\n\n" -}}
+{% if item.baseline_point -%}
+- **baseline**: {{ item.baseline_point.value}},
+- **startTime**: {{ item.baseline_point.timestamp }}, **endTime**: {{ target.end.timestamp }}
+- **lcommit**: `{{ item.baseline_point.commit }}`, **rcommit**: `{{ target.end.commit }}`
+{{ "\n" }}
+{%- endif %}
+{% endfor %}
+{% endif %}
+"""
 
 
 class ReportManager:
@@ -64,10 +118,64 @@ def run(
         main method used to insert the report to db and create github comment in targeted issue
         """
         try:
-            self.insert_to_db(cc)
+            applied_insertion = self.insert_to_db(cc)
         except Exception as e:
             logger.error(f"failed to insert report to db, error: {e}")
             raise
+        if not applied_insertion:
+            logger.info("[%s] skip notification,  already exists in db", self.config_id)
+            return
+        self.notify_github_comment(github_token)
+
+    def notify_github_comment(self, github_token: str):
+        if self.status != "regression":
+            logger.info(
+                "[%s] no regression found, skip notification",
+                self.config_id,
+            )
+            return
+
+        github_notification = self.config.policy.get_github_notification_config()
+        if not github_notification:
+            logger.info(
+                "[%s] no github notification config found, skip notification",
+                self.config_id,
+            )
+            return
+        logger.info("[%s] prepareing gitub comment content", self.config_id)
+        content = self._to_markdoown()
+        if self.is_dry_run:
+            logger.info(
+                "[%s]dry run, skip sending comment to github, report(%s)",
+                self.config_id,
+                self.id,
+            )
+            logger.info("[dry run] printing comment content")
+            print(json.dumps(content, indent=2, default=str))
+            logger.info("[dry run] Done! Finish printing comment content")
+            return
+        logger.info("[%s] create comment to github issue", self.config_id)
+        github_notification.create_github_comment(content, github_token)
+        logger.info("[%s] done. comment is sent to github", self.config_id)
+
+    def _to_markdoown(self):
+        self.regression_items = self._collect_regression_items()
+        url = ""
+        if self.config.hud_info:
+            url = self.config.hud_info.get("url", "")
+
+        md = Template(REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True).render(
+            id=self.id,
+            url=url,
+            status=self.status,
+            report_id=self.config_id,
+            summary=self.report["summary"],
+            baseline=self.baseline,
+            target=self.target,
+            frequency=self.config.policy.frequency.get_text(),
+            regression_items=self.regression_items,
+        )
+        return md
 
     def _collect_regression_items(self) -> list[PerGroupResult]:
         items = []
diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
@@ -18,7 +18,9 @@
 from dateutil.parser import isoparse
 
 
+# TODO(elainewy): change this to benchmark.benchmark_regression_report once the table is created
 BENCHMARK_REGRESSION_REPORT_TABLE = "fortesting.benchmark_regression_report"
+BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"]
 
 logging.basicConfig(
     level=logging.INFO,
@@ -33,9 +35,6 @@
     "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""),
 }
 
-# TODO(elainewy): change this to benchmark.benchmark_regression_report once the table is created
-BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"]
-
 
 def format_ts_with_t(ts: int) -> str:
     return dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).strftime(
@@ -54,7 +53,6 @@ def get_clickhouse_client(
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True, verify=False
     )
-
     return clickhouse_connect.get_client(
         host=host, user=user, password=password, secure=True
     )
@@ -77,8 +75,10 @@ def __init__(
         config_id: str,
         end_time: int,
         is_dry_run: bool = False,
+        is_pass_check: bool = False,
     ) -> None:
         self.is_dry_run = is_dry_run
+        self.is_pass_check = is_pass_check
         self.config_id = config_id
         self.end_time = end_time
 
@@ -127,6 +127,7 @@ def process(
         should_generate = self._should_generate_report(
             cc, self.end_time, self.config_id, report_freq
         )
+
         if not should_generate:
             self.log_info(
                 "Skip generate report",
@@ -138,7 +139,6 @@ def process(
                 f"with frequency {report_freq.get_text()}..."
             )
 
-        self.log_info("get target data")
         target, ls, le = self.get_target(config, self.end_time)
         if not target:
             self.log_info(
@@ -157,12 +157,12 @@ def process(
         regression_report = generator.generate()
         if self.is_dry_run:
             print(json.dumps(regression_report, indent=2, default=str))
-            return
 
         reportManager = ReportManager(
             config=config,
             regression_report=regression_report,
             db_table_name=BENCHMARK_REGRESSION_REPORT_TABLE,
+            is_dry_run=self.is_dry_run,
         )
         reportManager.run(cc, ENVS["GITHUB_TOKEN"])
         return
@@ -172,7 +172,8 @@ def get_target(self, config: BenchmarkConfig, end_time: int):
         target_s = end_time - data_range.comparison_timedelta_s()
         target_e = end_time
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(target_s)},{format_ts_with_t(target_e)}]"
+            "getting target data for time range "
+            f"[{format_ts_with_t(target_s)},{format_ts_with_t(target_e)}] ..."
         )
         target_data = self._fetch_from_benchmark_ts_api(
             config_id=config.id,
@@ -181,7 +182,7 @@ def get_target(self, config: BenchmarkConfig, end_time: int):
             source=config.source,
         )
         self.log_info(
-            f"found {len(target_data.time_series)} # of data, with time range {target_data.time_range}",
+            f"done. found {len(target_data.time_series)} # of data groups, with time range {target_data.time_range}",
         )
         if not target_data.time_range or not target_data.time_range.end:
             return None, target_s, target_e
@@ -196,7 +197,8 @@ def get_baseline(self, config: BenchmarkConfig, end_time: int):
         baseline_s = end_time - data_range.total_timedelta_s()
         baseline_e = end_time - data_range.comparison_timedelta_s()
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}]"
+            "getting baseline data for time range "
+            f"[{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}] ..."
         )
         # fetch baseline from api
         raw_data = self._fetch_from_benchmark_ts_api(
@@ -207,11 +209,7 @@ def get_baseline(self, config: BenchmarkConfig, end_time: int):
         )
 
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}]"
-        )
-
-        self.log_info(
-            f"found {len(raw_data.time_series)} # of data, with time range {raw_data.time_range}",
+            f"Done. found {len(raw_data.time_series)} # of data, with time range {raw_data.time_range}",
         )
 
         baseline_latest_ts = int(isoparse(raw_data.time_range.end).timestamp())
@@ -269,11 +267,8 @@ def _fetch_from_benchmark_ts_api(
             )
 
             elapsed_ms = (time.perf_counter() - t0) * 1000.0
-            logger.info(
-                "[%s] call OK in %.1f ms (query_len=%d)",
-                config_id,
-                elapsed_ms,
-                len(query),
+            self.log_info(
+                f"call OK in {elapsed_ms} ms (query_len={len(query)})",
             )
             return resp.data
         except requests.exceptions.HTTPError as e:
@@ -290,7 +285,7 @@ def _fetch_from_benchmark_ts_api(
                     else str(e)
                 )
             self.log_error(
-                f"[{config_id}] call FAILED in {elapsed_ms} ms: {err_msg}",
+                f"call FAILED in {elapsed_ms} ms: {err_msg}",
             )
             raise
 
@@ -348,6 +343,12 @@ def _get_latest_record_ts(
                 f"time_boundary({format_ts_with_t(time_boundary)})"
                 f"based on latest_record_ts({format_ts_with_t(latest_record_ts)})",
             )
+        # dry_run is True, is_pass_check is True, then we allow to generate report even the time check is not met
+        if self.is_dry_run and self.is_pass_check:
+            should_generate = True
+            self.log_info(
+                f"[{f.get_text()}] dry_run is True, is_pass_check is True, force generate report for print only",
+            )
         return should_generate
 
 
@@ -357,7 +358,12 @@ def main(
     args: Optional[argparse.Namespace] = None,
     *,
     is_dry_run: bool = False,
+    is_forced: bool = False,
 ):
+    if not is_dry_run and is_forced:
+        is_forced = False
+        logger.info("is_dry_run is False, force  must be disabled, this is not allowed")
+
     if not github_access_token:
         raise ValueError("Missing environment variable GITHUB_TOKEN")
 
@@ -378,7 +384,10 @@ def main(
     # caution, raise exception may lead lambda to retry
     try:
         processor = BenchmarkSummaryProcessor(
-            config_id=config_id, end_time=end_time_ts, is_dry_run=is_dry_run
+            config_id=config_id,
+            end_time=end_time_ts,
+            is_dry_run=is_dry_run,
+            is_pass_check=is_forced,
         )
         processor.process(args=args)
     except Exception as e:
@@ -419,6 +428,12 @@ def parse_args() -> argparse.Namespace:
         action="store_false",
         help="Disable dry-run mode",
     )
+    parser.add_argument(
+        "--force",
+        dest="force",
+        action="store_true",
+        help="Enable force mode, this only allowed when dry-run is enabled",
+    )
     parser.add_argument(
         "--config-id",
         type=str,
@@ -466,6 +481,7 @@ def local_run() -> None:
         github_access_token=args.github_access_token,
         args=args,
         is_dry_run=args.dry_run,
+        is_forced=args.force,
     )