We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 64e4cec commit 1b3d37dCopy full SHA for 1b3d37d
torchtitan/models/deepseek_v3/__init__.py
@@ -75,6 +75,8 @@
75
qk_rope_head_dim=64,
76
v_head_dim=128,
77
mscale=0.70,
78
+ use_flex_attn=True,
79
+ attn_mask_type="block_causal",
80
),
81
"16B": DeepSeekV3ModelArgs(
82
vocab_size=102400,
0 commit comments