Commit c7205b8

feat: split process_data out from run_training
`process_data=True` allowed users to implicitly process their data via `run_training`. Since `ilab` isn't the main consumption point of the training library going forward, it makes sense to separate these concerns: remove `data_output_dir` and assume `data_path` points to already-processed data, remove the `process_data` argument, and adjust the documentation, notebooks, etc.

Signed-off-by: Charlie Doern <[email protected]>
1 parent 910e46d commit c7205b8

4 files changed: +22 −36 lines
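With data processing split out of `run_training`, callers are now expected to run the processing step explicitly before training. The sketch below stitches that flow together from the updated README and notebook in this diff; it is a rough outline rather than a verified example. `training_args` is assumed to be the `TrainingArgs` instance built in the README example, and the raw-dataset path is a placeholder (the README snippet itself passes `training_args.data_path` to `DataProcessArgs`).

```python
# Sketch of the new explicit data-processing step (assembled from the updated
# README/notebook in this diff; not a verified end-to-end example).
import os

from instructlab.training import DataProcessArgs, data_process as dp

# training_args is assumed to be the TrainingArgs instance built earlier in the
# README example, with data_path pointing at the *processed* dataset location,
# e.g. "data/outputs/data.jsonl".
data_process_args = DataProcessArgs(
    data_output_path=os.path.dirname(training_args.data_path),  # directory where data.jsonl is written
    model_path=training_args.model_path,
    data_path="path/to/raw_dataset.jsonl",  # placeholder: the unprocessed, messages-format dataset
    max_seq_len=training_args.max_seq_len,
    chat_tmpl_path=training_args.chat_tmpl_path,
)

# The notebook in this diff kicks processing off via dp.main(...); run_training()
# then reads the processed file straight from training_args.data_path.
dp.main(data_process_args)
```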

README.md

Lines changed: 5 additions & 6 deletions
@@ -92,9 +92,8 @@ for training jobs. There are a number of options you can specify, such as settin
 | Field | Description |
 | --- | --- |
 | model_path | Either a reference to a HuggingFace repo or a path to a model saved in the HuggingFace format. |
-| data_path | A path to the `.jsonl` training dataset. This is expected to be in the messages format. |
+| data_path | A path to the `.jsonl` training dataset. This is expected to be processed (post filtering/tokenization/masking). |
 | ckpt_output_dir | Directory where trained model checkpoints will be saved. |
-| data_output_dir | Directory where the processed training data is stored (post filtering/tokenization/masking) |
 | max_seq_len | The maximum sequence length to be included in the training set. Samples exceeding this length will be dropped. |
 | max_batch_len | Maximum tokens per gpu for each batch that will be handled in a single step. Used as part of the multipack calculation. If running into out-of-memory errors, try to lower this value, but not below the `max_seq_len`. |
 | num_epochs | Number of epochs to run through before stopping. |
@@ -281,7 +280,7 @@ training_args = TrainingArgs(
     model_path = "ibm-granite/granite-7b-base",
     data_path = "path/to/dataset.jsonl",
     ckpt_output_dir = "data/saved_checkpoints",
-    data_output_dir = "data/outputs",
+    data_path = "data/outputs/data.jsonl",

     # define model-trianing parameters
     max_seq_len = 4096,
@@ -335,13 +334,14 @@ from instructlab.training import (
     DataProcessArgs,
     data_process as dp
 )
+import os

 training_args = TrainingArgs(
     # define data-specific arguments
     model_path = "ibm-granite/granite-7b-base",
     data_path = "path/to/dataset.jsonl",
     ckpt_output_dir = "data/saved_checkpoints",
-    data_output_dir = "data/outputs",
+    data_path = "data/outputs/data.jsonl",

     # define model-trianing parameters
     max_seq_len = 4096,
@@ -352,12 +352,11 @@ training_args = TrainingArgs(
     learning_rate = 2e-6,
     warmup_steps = 800,
     random_seed = 42,
-    process_data = True,
 )
 ...

 data_process_args = DataProcessArgs(
-    data_output_path = training_args.data_output_dir,
+    data_output_path = os.path.dirname(training_args.data_path),
     model_path = training_args.model_path,
     data_path = training_args.data_path,
     max_seq_len = training_args.max_seq_len,

examples/01_building_a_reasoning_model.ipynb

Lines changed: 14 additions & 5 deletions
@@ -123,8 +123,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions\n",
-"from instructlab.training.main_ds import run_training"
+"from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions,DataProcessArgs\n",
+"from instructlab.training.main_ds import run_training\n",
+"from instructlab.training.data_process import process_data as dp\n",
+"import os"
 ]
 },
 {
@@ -166,7 +168,7 @@
 "\tmodel_path=\"microsoft/Phi-4-mini-instruct\",\n",
 "\tdata_path=\"nemotron.jsonl\",\n",
 "\tckpt_output_dir=\"experiments/training_output\",\n",
-"\tdata_output_dir=\"data/processed-data\", # processed data ids/labels/masks\n",
+"\tdata_path=\"data/processed-data/data.jsonl\", # processed data ids/labels/masks\n",
 "\tmax_seq_len=20000,\n",
 "\tmax_batch_len=30000, # max tokens per gpu\n",
 "\tnum_epochs=3, \n",
@@ -176,17 +178,23 @@
 " save_samples=0, # save ckpt after num of samples seen (0=off)\n",
 " checkpoint_at_epoch = True, # save ckpt after every epoch\n",
 " accelerate_full_state_at_epoch = False, # save full-state for resuming\n",
-" process_data=True, # can set to false if data processed before\n",
 "\tdistributed_backend=DistributedBackend.FSDP,\n",
 "\tfsdp_options=FSDPOptions(cpu_offload_params=False),\n",
+")\n",
+"data_process_args = DataProcessArgs(\n",
+" data_output_path = os.path.dirname(train_args.data_path),\n",
+" model_path = train_args.model_path,\n",
+" data_path = train_args.data_path,\n",
+" max_seq_len = train_args.max_seq_len,\n",
+" chat_tmpl_path = train_args.chat_tmpl_path\n",
 ")"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Finally, we kick off SFT via the run_training function:"
+"Finally, we process the data and then kick off SFT via the run_training function:"
 ]
 },
 {
@@ -195,6 +203,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"dp.main(data_process_args)\n",
 "run_training(torch_args=torch_args,train_args=train_args)"
 ]
 },

src/instructlab/training/config.py

Lines changed: 2 additions & 8 deletions
@@ -162,13 +162,10 @@ class TrainingArgs(BaseModel):
     # this field determines if ibm_legacy_tmpl should be used instead
     use_legacy_tmpl: bool = False

-    # this field specifies the filepath to the training dataset before processing
+    # this field specifies the filepath to the training dataset
     data_path: str
-    ckpt_output_dir: str

-    # this field defines where we should be saving the processed version of the training dataset
-    # after we have tokenized it
-    data_output_dir: str
+    ckpt_output_dir: str

     max_seq_len: int
     max_batch_len: int
@@ -207,9 +204,6 @@ class TrainingArgs(BaseModel):
     # quantize_dtype: QuantizeDataType = QuantizeDataType.NONE
     lora: LoraOptions | None = None

-    # This field defines whether or not data processing will occur inside of `run_training()`
-    process_data: Optional[bool] = True
-
     # This field specifies whether only the last checkpoint should be retained. When set to true, it
     # will overwrite the previous checkpoint directory, keeping only one directory called
     # "last_epoch". This works alongside the '--checkpoint_at_epoch' flag.

src/instructlab/training/main_ds.py

Lines changed: 1 addition & 17 deletions
@@ -77,8 +77,6 @@
     set_random_seed,
     setup_logger,
 )
-import instructlab.training.data_process as dp
-

 def setup_optimizer(args, model):
     if args.distributed_training_framework == DistributedBackend.FSDP.value:
@@ -669,20 +667,6 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
             os.path.dirname(__file__), "chat_templates/ibm_legacy_tmpl.py"
         )

-    if train_args.process_data:
-        # TODO(osilkin):
-        # Decouple the data processing logic from training.
-        # Now that we've decided that repos will be less tethered to the
-        # design choices of the `ilab` CLI, we can make this change.
-        dp.process_data(
-            data_output_path=train_args.data_output_dir,
-            model_path=train_args.model_path,
-            data_path=train_args.data_path,
-            max_seq_len=train_args.max_seq_len,
-            chat_tmpl_path=train_args.chat_tmpl_path,
-            num_cpu_procs=train_args.data_process_num_cpu_procs,
-        )
-
     if not os.path.exists(train_args.ckpt_output_dir):
         os.makedirs(train_args.ckpt_output_dir, exist_ok=True)

@@ -695,7 +679,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         f"--rdzv_endpoint={torch_args.rdzv_endpoint}",
         __file__,
         f"--model_name_or_path={train_args.model_path}",
-        f"--data_path={train_args.data_output_dir}/data.jsonl",
+        f"--data_path={train_args.data_path}",
         f"--output_dir={train_args.ckpt_output_dir}",
         f"--num_epochs={train_args.num_epochs}",
         f"--effective_batch_size={train_args.effective_batch_size}",
