Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ description = "DeepSeek-V3 16B model training"
print_args = false

[profiling]
enable_profiling = false
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 10
enable_memory_snapshot = false
enable_memory_snapshot = true
save_memory_snapshot_folder = "memory_snapshot"

[metrics]
Expand Down Expand Up @@ -35,10 +35,10 @@ decay_type = "cosine"
min_lr_factor = 0.1

[training]
local_batch_size = 8
seq_len = 4096
local_batch_size = 4
seq_len = 2048
max_norm = 1.0 # grad norm clipping
steps = 1000
steps = 100
dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)

[parallelism]
Expand All @@ -49,7 +49,7 @@ tensor_parallel_degree = 1
enable_async_tensor_parallel = false
pipeline_parallel_degree = 1
pipeline_parallel_schedule = "Interleaved1F1B"
expert_parallel_degree = 8
expert_parallel_degree = 4
expert_tensor_parallel_degree = 1

[checkpoint]
Expand All @@ -61,11 +61,11 @@ export_dtype = "float32"
async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]

[activation_checkpoint]
mode = "selective" # ["none", "selective", "full"]
mode = "full" # ["none", "selective", "full"]
selective_ac_option = 'op' # 'int' = apply AC every positive-int layers; 'op' = apply AC based on the ops policy

[compile]
enable=true
enable=false
components = ["loss"] # ["model", "loss"]

[float8]
Expand Down