From b62509f5fec4fec49292ad3c8207892b4fd9b7f9 Mon Sep 17 00:00:00 2001
From: Charlie Doern
Date: Tue, 1 Apr 2025 09:57:05 -0400
Subject: [PATCH] feat: split process_data out from run_training

process_data=True allowed users to implicitly process their data via
run_training. Since `ilab` isn't the main consumption point of the training
library going forward, it makes sense to separate these concerns.

Remove `data_output_dir` and assume `data_path` points to already-processed
data. Also remove the `process_data` argument and adjust the documentation,
notebooks, etc. accordingly.

Signed-off-by: Charlie Doern
---
 README.md                                    | 15 ++++++---------
 examples/01_building_a_reasoning_model.ipynb | 20 ++++++++++++++------
 src/instructlab/training/config.py           | 10 ++--------
 src/instructlab/training/main_ds.py          | 17 +----------------
 4 files changed, 23 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 3a931f51..cd02f65e 100644
--- a/README.md
+++ b/README.md
@@ -92,9 +92,8 @@ for training jobs. There are a number of options you can specify, such as settin
 | Field | Description |
 | --- | --- |
 | model_path | Either a reference to a HuggingFace repo or a path to a model saved in the HuggingFace format. |
-| data_path | A path to the `.jsonl` training dataset. This is expected to be in the messages format. |
+| data_path | A path to the `.jsonl` training dataset. This is expected to be already processed (post filtering/tokenization/masking). |
 | ckpt_output_dir | Directory where trained model checkpoints will be saved. |
-| data_output_dir | Directory where the processed training data is stored (post filtering/tokenization/masking) |
 | max_seq_len | The maximum sequence length to be included in the training set. Samples exceeding this length will be dropped. |
 | max_batch_len | Maximum tokens per gpu for each batch that will be handled in a single step. Used as part of the multipack calculation. If running into out-of-memory errors, try to lower this value, but not below the `max_seq_len`. |
 | num_epochs | Number of epochs to run through before stopping. |
@@ -281,7 +280,6 @@ training_args = TrainingArgs(
     model_path = "ibm-granite/granite-7b-base",
-    data_path = "path/to/dataset.jsonl",
+    data_path = "data/outputs/data.jsonl", # processed dataset (see the data processing example below)
     ckpt_output_dir = "data/saved_checkpoints",
-    data_output_dir = "data/outputs",
 
     # define model-trianing parameters
     max_seq_len = 4096,
@@ -335,13 +333,13 @@ from instructlab.training import (
     DataProcessArgs,
     data_process as dp
 )
+import os
 
 training_args = TrainingArgs(
     # define data-specific arguments
     model_path = "ibm-granite/granite-7b-base",
-    data_path = "path/to/dataset.jsonl",
+    data_path = "data/outputs/data.jsonl",
     ckpt_output_dir = "data/saved_checkpoints",
-    data_output_dir = "data/outputs",
 
     # define model-trianing parameters
     max_seq_len = 4096,
@@ -352,12 +350,11 @@ training_args = TrainingArgs(
     learning_rate = 2e-6,
     warmup_steps = 800,
     random_seed = 42,
-    process_data = True,
 )
 
 ...
 data_process_args = DataProcessArgs(
-    data_output_path = training_args.data_output_dir,
+    data_output_path = os.path.dirname(training_args.data_path),
     model_path = training_args.model_path,
-    data_path = training_args.data_path,
+    data_path = "path/to/dataset.jsonl", # raw dataset to be processed
     max_seq_len = training_args.max_seq_len,
diff --git a/examples/01_building_a_reasoning_model.ipynb b/examples/01_building_a_reasoning_model.ipynb
index 2ba27c4c..a39096c1 100644
--- a/examples/01_building_a_reasoning_model.ipynb
+++ b/examples/01_building_a_reasoning_model.ipynb
@@ -123,8 +123,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions\n",
-    "from instructlab.training.main_ds import run_training"
+    "from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions,DataProcessArgs\n",
+    "from instructlab.training.main_ds import run_training\n",
+    "import instructlab.training.data_process as dp\n",
+    "import os"
    ]
   },
   {
@@ -166,7 +168,6 @@
     "\tmodel_path=\"microsoft/Phi-4-mini-instruct\",\n",
-    "\tdata_path=\"nemotron.jsonl\",\n",
+    "\tdata_path=\"data/processed-data/data.jsonl\", # processed data ids/labels/masks\n",
     "\tckpt_output_dir=\"experiments/training_output\",\n",
-    "\tdata_output_dir=\"data/processed-data\", # processed data ids/labels/masks\n",
     "\tmax_seq_len=20000,\n",
     "\tmax_batch_len=30000, # max tokens per gpu\n",
     "\tnum_epochs=3, \n",
@@ -176,9 +177,15 @@
     " save_samples=0, # save ckpt after num of samples seen (0=off)\n",
     " checkpoint_at_epoch = True, # save ckpt after every epoch\n",
     " accelerate_full_state_at_epoch = False, # save full-state for resuming\n",
-    " process_data=True, # can set to false if data processed before\n",
     "\tdistributed_backend=DistributedBackend.FSDP,\n",
     "\tfsdp_options=FSDPOptions(cpu_offload_params=False),\n",
+    ")\n",
+    "data_process_args = DataProcessArgs(\n",
+    "    data_output_path = os.path.dirname(train_args.data_path),\n",
+    "    model_path = train_args.model_path,\n",
+    "    data_path = \"nemotron.jsonl\", # raw dataset to be processed\n",
+    "    max_seq_len = train_args.max_seq_len,\n",
+    "    chat_tmpl_path = train_args.chat_tmpl_path\n",
     ")"
    ]
   },
@@ -186,7 +193,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Finally, we kick off SFT via the run_training function:"
+    "Finally, we process the data and then kick off SFT via the run_training function:"
    ]
   },
   {
@@ -195,6 +202,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "dp.main(data_process_args)\n",
     "run_training(torch_args=torch_args,train_args=train_args)"
    ]
   },
diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index 7229d446..d704cafb 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -162,13 +162,10 @@ class TrainingArgs(BaseModel):
     # this field determines if ibm_legacy_tmpl should be used instead
     use_legacy_tmpl: bool = False
 
-    # this field specifies the filepath to the training dataset before processing
+    # this field specifies the filepath to the processed training dataset
     data_path: str
-    ckpt_output_dir: str
 
-    # this field defines where we should be saving the processed version of the training dataset
-    # after we have tokenized it
-    data_output_dir: str
+    ckpt_output_dir: str
 
     max_seq_len: int
     max_batch_len: int
@@ -207,9 +204,6 @@ class TrainingArgs(BaseModel):
     # quantize_dtype: QuantizeDataType = QuantizeDataType.NONE
     lora: LoraOptions | None = None
 
-    # This field defines whether or not data processing will occur inside of `run_training()`
-    process_data: Optional[bool] = True
-
     # This field specifies whether only the last checkpoint should be retained. When set to true, it
     # will overwrite the previous checkpoint directory, keeping only one directory called
     # "last_epoch". This works alongside the '--checkpoint_at_epoch' flag.
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index a7b8f8d5..b18a79d2 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -77,7 +77,6 @@
     set_random_seed,
     setup_logger,
 )
-import instructlab.training.data_process as dp
 
 
 def setup_optimizer(args, model):
@@ -669,20 +668,6 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
             os.path.dirname(__file__), "chat_templates/ibm_legacy_tmpl.py"
         )
 
-    if train_args.process_data:
-        # TODO(osilkin):
-        # Decouple the data processing logic from training.
-        # Now that we've decided that repos will be less tethered to the
-        # design choices of the `ilab` CLI, we can make this change.
-        dp.process_data(
-            data_output_path=train_args.data_output_dir,
-            model_path=train_args.model_path,
-            data_path=train_args.data_path,
-            max_seq_len=train_args.max_seq_len,
-            chat_tmpl_path=train_args.chat_tmpl_path,
-            num_cpu_procs=train_args.data_process_num_cpu_procs,
-        )
-
     if not os.path.exists(train_args.ckpt_output_dir):
         os.makedirs(train_args.ckpt_output_dir, exist_ok=True)
 
@@ -695,7 +680,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         f"--rdzv_endpoint={torch_args.rdzv_endpoint}",
         __file__,
         f"--model_name_or_path={train_args.model_path}",
-        f"--data_path={train_args.data_output_dir}/data.jsonl",
+        f"--data_path={train_args.data_path}",
         f"--output_dir={train_args.ckpt_output_dir}",
         f"--num_epochs={train_args.num_epochs}",
         f"--effective_batch_size={train_args.effective_batch_size}",
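
Note: after this change, data processing is an explicit step that callers run themselves before `run_training()`. The sketch below illustrates the resulting two-step flow under stated assumptions: the paths, model name, hyperparameter values, and the `TorchrunArgs` rendezvous settings are placeholders, the top-level imports follow the library README shown above, and the `process_data()` keyword arguments mirror the call removed from `run_training()` in this patch.

```python
# Illustrative sketch only: process the raw dataset first, then train on the output.
import os

from instructlab.training import TorchrunArgs, TrainingArgs, run_training
import instructlab.training.data_process as dp

train_args = TrainingArgs(
    model_path="ibm-granite/granite-7b-base",
    data_path="data/outputs/data.jsonl",  # processed data that run_training now reads directly
    ckpt_output_dir="data/saved_checkpoints",
    max_seq_len=4096,
    max_batch_len=60000,
    num_epochs=10,
    effective_batch_size=3840,
    save_samples=250000,
    learning_rate=2e-6,
    warmup_steps=800,
    random_seed=42,
)

# Explicit data-processing step (previously triggered implicitly by process_data=True).
dp.process_data(
    data_output_path=os.path.dirname(train_args.data_path),  # data.jsonl is written into this directory
    model_path=train_args.model_path,
    data_path="path/to/raw_dataset.jsonl",  # raw, unprocessed dataset (placeholder path)
    max_seq_len=train_args.max_seq_len,
    chat_tmpl_path=train_args.chat_tmpl_path,
    num_cpu_procs=train_args.data_process_num_cpu_procs,
)

# Single-node, single-GPU torchrun settings; values are placeholders.
torch_args = TorchrunArgs(
    nnodes=1,
    nproc_per_node=1,
    node_rank=0,
    rdzv_id=123,
    rdzv_endpoint="127.0.0.1:12345",
)

run_training(torch_args=torch_args, train_args=train_args)
```

Because `train_args.data_path` equals `{data_output_path}/data.jsonl`, the file produced by `process_data()` is exactly what `run_training()` consumes; keeping that invariant is now the caller's responsibility since the implicit processing step is gone.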