add github notification

yangw-dev · yangw-dev · commit 6aa36768a999 · 2025-09-04T19:36:39.000-07:00
ghstack-source-id: 87b959c Pull-Request: #7096
diff --git a/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py b/aws/lambda/benchmark_regression_summary_report/common/benchmark_time_series_api_model.py
@@ -1,6 +1,5 @@
-import datetime as dt
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
 
 import requests
 
@@ -63,24 +62,3 @@ def from_request(
         except Exception as e:
             raise RuntimeError(f"Malformed API payload: {e}")
         return cls(data=BenchmarkTimeSeriesApiData(time_series=ts, time_range=tr))
-
-
-def get_latest_meta_info(
-    time_series: List[BenchmarkTimeSeriesItem],
-) -> Optional[dict[str, Any]]:
-    if not time_series:
-        return None
-
-    pts = [p for s in time_series for p in s.data]
-    latest = max(
-        pts,
-        key=lambda p: dt.datetime.fromisoformat(
-            p["granularity_bucket"].replace("Z", "+00:00")
-        ),
-    )
-    return {
-        "commit": latest.get("commit", ""),
-        "branch": latest.get("branch", ""),
-        "timestamp": latest.get("granularity_bucket", ""),
-        "workflow_id": latest.get("workflow_id", ""),
-    }
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config.py b/aws/lambda/benchmark_regression_summary_report/common/config.py
@@ -15,7 +15,6 @@
 # their own benchmark regression config, currenlty place
 # here for lambda
 
-
 COMPILER_BENCHMARK_CONFIG = BenchmarkConfig(
     name="Compiler Benchmark Regression",
     id="compiler_regression",
@@ -44,6 +43,9 @@
                 }
                 """,
     ),
+    hud_info={
+        "url": "https://hud.pytorch.org/benchmark/compilers",
+    },
     # set baseline from past 7 days using avg, and compare with the last 1 day
     policy=Policy(
         frequency=Frequency(value=1, unit="days"),
@@ -67,7 +69,7 @@
             "compression_ratio": RegressionPolicy(
                 name="compression_ratio",
                 condition="greater_equal",
-                threshold=0.9,
+                threshold=0.95,
                 baseline_aggregation="max",
             ),
         },
diff --git a/aws/lambda/benchmark_regression_summary_report/common/config_model.py b/aws/lambda/benchmark_regression_summary_report/common/config_model.py
@@ -241,6 +241,7 @@ class BenchmarkConfig:
     id: str
     source: BenchmarkApiSource
     policy: Policy
+    hud_info: Optional[dict[str, Any]] = None
 
 
 @dataclass
diff --git a/aws/lambda/benchmark_regression_summary_report/common/report_manager.py b/aws/lambda/benchmark_regression_summary_report/common/report_manager.py
@@ -12,9 +12,62 @@
     get_regression_status,
     PerGroupResult,
 )
+from jinja2 import Template
 
 
 logger = logging.getLogger()
+REPORT_MD_TEMPLATE = """# Benchmark Report {{ id }}
+config_id: `{{ report_id }}`
+
+We have detected **{{ status }}** in benchmark results for `{{ report_id }}` (id: `{{ id }}`).
+Full report details will be available in HUD soon.
+
+> **Status:** {{ status }} · **Frequency:** {{ frequency }}
+
+## Summary
+| Metric | Value |
+| :-- | --: |
+| Total | {{ summary.total_count | default(0) }} |
+| Regressions | {{ summary.regression_count | default(0) }} |
+| Suspicious | {{ summary.suspicious_count | default(0) }} |
+| No Regression | {{ summary.no_regression_count | default(0) }} |
+| Insufficient Data | {{ summary.insufficient_data_count | default(0) }} |
+
+## Data Windows
+baseline is the data we aggregated (max, min, earliest, latest) as measurement to decide whether metric values
+in target are considered regressions.
+
+### Baseline window
+- **Start:** `{{ baseline.start.timestamp | default('') }}` (commit: `{{ baseline.start.commit | default('') }}`)
+- **End:** `{{ baseline.end.timestamp   | default('') }}` (commit: `{{ baseline.end.commit   | default('') }}`)
+
+### Target window
+- **Start:** `{{ target.start.timestamp | default('') }}` (commit: `{{ target.start.commit | default('') }}`)
+- **End:** `{{ target.end.timestamp   | default('') }}` (commit: `{{ target.end.commit   | default('') }}`)
+
+{% if regression_items and regression_items|length > 0 %}
+## Regression Glance
+{% if url %}
+Use items below in [HUD]({{ url }}) to see regression.
+{% endif %}
+
+{% set items = regression_items if regression_items|length <= 10 else regression_items[:10] %}
+{% if regression_items|length > 10 %}
+… (showing first 10 only, total {{ regression_items|length }} regressions)
+{% endif %}
+{% for item in items %}
+{% set kv = item.group_info|dictsort %}
+{{ "" }}|{% for k, _ in kv %}{{ k }} |{% endfor %}{{ "\n" -}}
+|{% for _k, _ in kv %}---|{% endfor %}{{ "\n" -}}
+|{% for _k, v in kv %}{{ v }} |{% endfor %}{{ "\n\n" -}}
+{% if item.baseline_point -%}
+- **startTime**: {{ item.baseline_point.timestamp }}, **endTime**: {{ target.end.timestamp }}
+- **lcommit**: `{{ item.baseline_point.commit }}`, **rcommit**: `{{ target.end.commit }}`
+{{ "\n" }}
+{%- endif %}
+{% endfor %}
+{% endif %}
+"""
 
 
 class ReportManager:
@@ -68,6 +121,47 @@ def run(
         except Exception as e:
             logger.error(f"failed to insert report to db, error: {e}")
             raise
+        self.notify_github_comment(github_token)
+
+    def notify_github_comment(self, github_token: str):
+        """Comment the markdown report on GitHub when status is "regression".
+
+        Logs and returns early if the status differs or the policy has no
+        GitHub notification config.
+        """
+        if self.status != "regression":
+            logger.info("[%s] no regression found, skip notification", self.config_id)
+            return
+
+        github_notification = self.config.policy.get_github_notification_config()
+        if not github_notification:
+            msg = "[%s] no github notification config found, skip notification"
+            logger.info(msg, self.config_id)
+            return
+        logger.info("[%s] preparing content", self.config_id)
+        content = self._to_markdoown()
+        logger.info("[%s] create comment to github issue", self.config_id)
+        github_notification.create_github_comment(content, github_token)
+        logger.info("[%s] done. comment is sent to github", self.config_id)
+
+    def _to_markdoown(self):
+        """Render REPORT_MD_TEMPLATE into the markdown text for this report."""
+        # The collected items are also kept on the instance, not just rendered.
+        self.regression_items = self._collect_regression_items()
+        # hud_info is optional; an empty url makes the template skip the HUD link.
+        hud_url = (self.config.hud_info or {}).get("url", "")
+        template = Template(REPORT_MD_TEMPLATE, trim_blocks=True, lstrip_blocks=True)
+        return template.render(
+            id=self.id,
+            url=hud_url,
+            status=self.status,
+            report_id=self.config_id,
+            summary=self.report["summary"],
+            baseline=self.baseline,
+            target=self.target,
+            frequency=self.config.policy.frequency.get_text(),
+            regression_items=self.regression_items,
+        )
 
     def _collect_regression_items(self) -> list[PerGroupResult]:
         items = []
diff --git a/aws/lambda/benchmark_regression_summary_report/lambda_function.py b/aws/lambda/benchmark_regression_summary_report/lambda_function.py
@@ -18,7 +18,9 @@
 from dateutil.parser import isoparse
 
 
+# TODO(elainewy): change this to benchmark.benchmark_regression_report once the table is created
 BENCHMARK_REGRESSION_REPORT_TABLE = "fortesting.benchmark_regression_report"
+BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"]
 
 logging.basicConfig(
     level=logging.INFO,
@@ -33,9 +35,6 @@
     "CLICKHOUSE_USERNAME": os.getenv("CLICKHOUSE_USERNAME", ""),
 }
 
-# TODO(elainewy): change this to benchmark.benchmark_regression_report once the table is created
-BENCHMARK_REGRESSION_TRACKING_CONFIG_IDS = ["compiler_regression"]
-
 
 def format_ts_with_t(ts: int) -> str:
     return dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).strftime(
@@ -138,7 +137,6 @@ def process(
                 f"with frequency {report_freq.get_text()}..."
             )
 
-        self.log_info("get target data")
         target, ls, le = self.get_target(config, self.end_time)
         if not target:
             self.log_info(
@@ -172,7 +170,8 @@ def get_target(self, config: BenchmarkConfig, end_time: int):
         target_s = end_time - data_range.comparison_timedelta_s()
         target_e = end_time
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(target_s)},{format_ts_with_t(target_e)}]"
+            "getting target data for time range "
+            f"[{format_ts_with_t(target_s)},{format_ts_with_t(target_e)}] ..."
         )
         target_data = self._fetch_from_benchmark_ts_api(
             config_id=config.id,
@@ -181,7 +180,7 @@ def get_target(self, config: BenchmarkConfig, end_time: int):
             source=config.source,
         )
         self.log_info(
-            f"found {len(target_data.time_series)} # of data, with time range {target_data.time_range}",
+            f"done. found {len(target_data.time_series)} # of data groups, with time range {target_data.time_range}",
         )
         if not target_data.time_range or not target_data.time_range.end:
             return None, target_s, target_e
@@ -196,7 +195,8 @@ def get_baseline(self, config: BenchmarkConfig, end_time: int):
         baseline_s = end_time - data_range.total_timedelta_s()
         baseline_e = end_time - data_range.comparison_timedelta_s()
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}]"
+            "getting baseline data for time range "
+            f"[{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}] ..."
         )
         # fetch baseline from api
         raw_data = self._fetch_from_benchmark_ts_api(
@@ -207,11 +207,7 @@ def get_baseline(self, config: BenchmarkConfig, end_time: int):
         )
 
         self.log_info(
-            f"get baseline data for time range [{format_ts_with_t(baseline_s)},{format_ts_with_t(baseline_e)}]"
-        )
-
-        self.log_info(
-            f"found {len(raw_data.time_series)} # of data, with time range {raw_data.time_range}",
+            f"Done. found {len(raw_data.time_series)} # of data, with time range {raw_data.time_range}",
         )
 
         baseline_latest_ts = int(isoparse(raw_data.time_range.end).timestamp())
@@ -269,11 +265,8 @@ def _fetch_from_benchmark_ts_api(
             )
 
             elapsed_ms = (time.perf_counter() - t0) * 1000.0
-            logger.info(
-                "[%s] call OK in %.1f ms (query_len=%d)",
-                config_id,
-                elapsed_ms,
-                len(query),
+            self.log_info(
+                f"call OK in {elapsed_ms} ms (query_len={len(query)})",
             )
             return resp.data
         except requests.exceptions.HTTPError as e:
@@ -290,7 +283,7 @@ def _fetch_from_benchmark_ts_api(
                     else str(e)
                 )
             self.log_error(
-                f"[{config_id}] call FAILED in {elapsed_ms} ms: {err_msg}",
+                f"call FAILED in {elapsed_ms} ms: {err_msg}",
             )
             raise