@@ -55,6 +55,23 @@ class OpStat:
     count: int = 0


+def resolve_native_multi_head_attention(*args, **kwargs):
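+    # Shape-only stand-in: build the op's output as an empty tensor on the
+    # meta device instead of running the real attention kernel.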
+    query, key, value = args[0], args[1], args[2]
+    seq_len, batch_size, embed_dim = query.shape
+    attn_output = torch.empty(
+        (seq_len, batch_size, embed_dim), dtype=query.dtype, device="meta"
+    )
+
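+    # The native op can also return attention weights; a sketch of their shape is kept below.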
+    # seq_len_k = key.shape[0]
+    # num_heads = args[4]
+    # attn_output_weights = torch.empty((batch_size, num_heads, seq_len, seq_len_k),
+    #                                   dtype=query.dtype, device='meta')
+    return attn_output  # , attn_output_weights
+
+
 def resolve_get_attr(gm: torch.fx.GraphModule, node: torch.fx.Node):
     attr_itr = node.target.split(".")
     val = gm
@@ -65,8 +82,8 @@ def resolve_get_attr(gm: torch.fx.GraphModule, node: torch.fx.Node):


 def collect_op_stats(model, input_dict):
-    # FX symbolic trace
     try:
+        # FX symbolic trace
         traced = torch.fx.symbolic_trace(model)
         # print(traced.graph)
     except Exception:
@@ -118,7 +135,11 @@ def collect_op_stats(model, input_dict):
             node_args = node_args[1:]

         try:
-            out = op_func(*node_args, **node_kwargs)
+            if op_name == "_native_multi_head_attention":
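+                # Special case: resolve this op's output shape by hand rather than calling it on meta tensors.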
+                out = resolve_native_multi_head_attention(*node_args, **node_kwargs)
+            else:
+                out = op_func(*node_args, **node_kwargs)
             node_outputs[node.name] = out
             dtype = out.dtype if isinstance(out, torch.Tensor) else None
         except Exception: