From b62509f5fec4fec49292ad3c8207892b4fd9b7f9 Mon Sep 17 00:00:00 2001
From: Charlie Doern
Date: Tue, 1 Apr 2025 09:57:05 -0400
Subject: [PATCH] feat: split process_data out from run_training

process_data=True allowed users to implicitly process their data via
run_training. Since `ilab` isn't the main consumption point of the training
library going forward, it makes sense to separate these concerns.

Remove `data_output_dir` and assume `data_path` points to already-processed
data. Also remove the `process_data` argument and adjust the documentation,
notebooks, etc. accordingly.

Signed-off-by: Charlie Doern
---
 README.md                                    | 15 ++++++---------
 examples/01_building_a_reasoning_model.ipynb | 20 ++++++++++++++------
 src/instructlab/training/config.py           | 10 ++--------
 src/instructlab/training/main_ds.py          | 17 +----------------
 4 files changed, 23 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 3a931f51..cd02f65e 100644
--- a/README.md
+++ b/README.md
@@ -92,9 +92,8 @@ for training jobs. There are a number of options you can specify, such as settin
 | Field | Description |
 | --- | --- |
 | model_path | Either a reference to a HuggingFace repo or a path to a model saved in the HuggingFace format. |
-| data_path | A path to the `.jsonl` training dataset. This is expected to be in the messages format. |
+| data_path | A path to the `.jsonl` training dataset. This is expected to be already processed (post filtering/tokenization/masking). |
 | ckpt_output_dir | Directory where trained model checkpoints will be saved. |
-| data_output_dir | Directory where the processed training data is stored (post filtering/tokenization/masking) |
 | max_seq_len | The maximum sequence length to be included in the training set. Samples exceeding this length will be dropped. |
 | max_batch_len | Maximum tokens per gpu for each batch that will be handled in a single step. Used as part of the multipack calculation. If running into out-of-memory errors, try to lower this value, but not below the `max_seq_len`. |
 | num_epochs | Number of epochs to run through before stopping. |
@@ -281,7 +280,6 @@ training_args = TrainingArgs(
     model_path = "ibm-granite/granite-7b-base",
-    data_path = "path/to/dataset.jsonl",
+    data_path = "data/outputs/data.jsonl", # processed dataset (see the data processing example below)
     ckpt_output_dir = "data/saved_checkpoints",
-    data_output_dir = "data/outputs",
 
     # define model-trianing parameters
     max_seq_len = 4096,
@@ -335,13 +333,13 @@ from instructlab.training import (
     DataProcessArgs,
     data_process as dp
 )
+import os
 
 training_args = TrainingArgs(
     # define data-specific arguments
     model_path = "ibm-granite/granite-7b-base",
-    data_path = "path/to/dataset.jsonl",
+    data_path = "data/outputs/data.jsonl",
     ckpt_output_dir = "data/saved_checkpoints",
-    data_output_dir = "data/outputs",
 
     # define model-trianing parameters
     max_seq_len = 4096,
@@ -352,12 +350,11 @@ training_args = TrainingArgs(
     learning_rate = 2e-6,
     warmup_steps = 800,
     random_seed = 42,
-    process_data = True,
 )
 
 ...
 data_process_args = DataProcessArgs(
-    data_output_path = training_args.data_output_dir,
+    data_output_path = os.path.dirname(training_args.data_path),
     model_path = training_args.model_path,
-    data_path = training_args.data_path,
+    data_path = "path/to/dataset.jsonl", # raw dataset to be processed
     max_seq_len = training_args.max_seq_len,
diff --git a/examples/01_building_a_reasoning_model.ipynb b/examples/01_building_a_reasoning_model.ipynb
index 2ba27c4c..a39096c1 100644
--- a/examples/01_building_a_reasoning_model.ipynb
+++ b/examples/01_building_a_reasoning_model.ipynb
@@ -123,8 +123,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions\n",
-    "from instructlab.training.main_ds import run_training"
+    "from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions,DataProcessArgs\n",
+    "from instructlab.training.main_ds import run_training\n",
+    "import instructlab.training.data_process as dp\n",
+    "import os"
    ]
   },
   {
@@ -166,7 +168,6 @@
     "\tmodel_path=\"microsoft/Phi-4-mini-instruct\",\n",
-    "\tdata_path=\"nemotron.jsonl\",\n",
+    "\tdata_path=\"data/processed-data/data.jsonl\", # processed data ids/labels/masks\n",
     "\tckpt_output_dir=\"experiments/training_output\",\n",
-    "\tdata_output_dir=\"data/processed-data\", # processed data ids/labels/masks\n",
     "\tmax_seq_len=20000,\n",
     "\tmax_batch_len=30000, # max tokens per gpu\n",
     "\tnum_epochs=3, \n",
@@ -176,9 +177,15 @@
     " save_samples=0, # save ckpt after num of samples seen (0=off)\n",
     " checkpoint_at_epoch = True, # save ckpt after every epoch\n",
     " accelerate_full_state_at_epoch = False, # save full-state for resuming\n",
-    " process_data=True, # can set to false if data processed before\n",
     "\tdistributed_backend=DistributedBackend.FSDP,\n",
     "\tfsdp_options=FSDPOptions(cpu_offload_params=False),\n",
+    ")\n",
+    "data_process_args = DataProcessArgs(\n",
+    "    data_output_path = os.path.dirname(train_args.data_path),\n",
+    "    model_path = train_args.model_path,\n",
+    "    data_path = \"nemotron.jsonl\", # raw dataset to be processed\n",
+    "    max_seq_len = train_args.max_seq_len,\n",
+    "    chat_tmpl_path = train_args.chat_tmpl_path\n",
     ")"
    ]
   },
@@ -186,7 +193,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Finally, we kick off SFT via the run_training function:"
+    "Finally, we process the data and then kick off SFT via the run_training function:"
    ]
   },
   {
@@ -195,6 +202,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "dp.main(data_process_args)\n",
     "run_training(torch_args=torch_args,train_args=train_args)"
    ]
   },
diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index 7229d446..d704cafb 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -162,13 +162,10 @@ class TrainingArgs(BaseModel):
     # this field determines if ibm_legacy_tmpl should be used instead
     use_legacy_tmpl: bool = False
 
-    # this field specifies the filepath to the training dataset before processing
+    # this field specifies the filepath to the processed training dataset
     data_path: str
-    ckpt_output_dir: str
 
-    # this field defines where we should be saving the processed version of the training dataset
-    # after we have tokenized it
-    data_output_dir: str
+    ckpt_output_dir: str
 
     max_seq_len: int
     max_batch_len: int
@@ -207,9 +204,6 @@ class TrainingArgs(BaseModel):
     # quantize_dtype: QuantizeDataType = QuantizeDataType.NONE
     lora: LoraOptions | None = None
 
-    # This field defines whether or not data processing will occur inside of `run_training()`
-    process_data: Optional[bool] = True
-
     # This field specifies whether only the last checkpoint should be retained. When set to true, it
     # will overwrite the previous checkpoint directory, keeping only one directory called
     # "last_epoch". This works alongside the '--checkpoint_at_epoch' flag.
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index a7b8f8d5..b18a79d2 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -77,7 +77,6 @@
     set_random_seed,
     setup_logger,
 )
-import instructlab.training.data_process as dp
 
 
 def setup_optimizer(args, model):
@@ -669,20 +668,6 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
             os.path.dirname(__file__), "chat_templates/ibm_legacy_tmpl.py"
         )
 
-    if train_args.process_data:
-        # TODO(osilkin):
-        # Decouple the data processing logic from training.
-        # Now that we've decided that repos will be less tethered to the
-        # design choices of the `ilab` CLI, we can make this change.
-        dp.process_data(
-            data_output_path=train_args.data_output_dir,
-            model_path=train_args.model_path,
-            data_path=train_args.data_path,
-            max_seq_len=train_args.max_seq_len,
-            chat_tmpl_path=train_args.chat_tmpl_path,
-            num_cpu_procs=train_args.data_process_num_cpu_procs,
-        )
-
     if not os.path.exists(train_args.ckpt_output_dir):
         os.makedirs(train_args.ckpt_output_dir, exist_ok=True)
 
@@ -695,7 +680,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         f"--rdzv_endpoint={torch_args.rdzv_endpoint}",
         __file__,
         f"--model_name_or_path={train_args.model_path}",
-        f"--data_path={train_args.data_output_dir}/data.jsonl",
+        f"--data_path={train_args.data_path}",
         f"--output_dir={train_args.ckpt_output_dir}",
         f"--num_epochs={train_args.num_epochs}",
         f"--effective_batch_size={train_args.effective_batch_size}",
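
Note: after this change, data processing is an explicit step that callers run themselves before `run_training()`. The sketch below illustrates the resulting two-step flow under stated assumptions: the paths, model name, hyperparameter values, and the `TorchrunArgs` rendezvous settings are placeholders, the top-level imports follow the library README shown above, and the `process_data()` keyword arguments mirror the call removed from `run_training()` in this patch.

```python
# Illustrative sketch only: process the raw dataset first, then train on the output.
import os

from instructlab.training import TorchrunArgs, TrainingArgs, run_training
import instructlab.training.data_process as dp

train_args = TrainingArgs(
    model_path="ibm-granite/granite-7b-base",
    data_path="data/outputs/data.jsonl",  # processed data that run_training now reads directly
    ckpt_output_dir="data/saved_checkpoints",
    max_seq_len=4096,
    max_batch_len=60000,
    num_epochs=10,
    effective_batch_size=3840,
    save_samples=250000,
    learning_rate=2e-6,
    warmup_steps=800,
    random_seed=42,
)

# Explicit data-processing step (previously triggered implicitly by process_data=True).
dp.process_data(
    data_output_path=os.path.dirname(train_args.data_path),  # data.jsonl is written into this directory
    model_path=train_args.model_path,
    data_path="path/to/raw_dataset.jsonl",  # raw, unprocessed dataset (placeholder path)
    max_seq_len=train_args.max_seq_len,
    chat_tmpl_path=train_args.chat_tmpl_path,
    num_cpu_procs=train_args.data_process_num_cpu_procs,
)

# Single-node, single-GPU torchrun settings; values are placeholders.
torch_args = TorchrunArgs(
    nnodes=1,
    nproc_per_node=1,
    node_rank=0,
    rdzv_id=123,
    rdzv_endpoint="127.0.0.1:12345",
)

run_training(torch_args=torch_args, train_args=train_args)
```

Because `train_args.data_path` equals `{data_output_path}/data.jsonl`, the file produced by `process_data()` is exactly what `run_training()` consumes; keeping that invariant is now the caller's responsibility since the implicit processing step is gone.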