Skip to content

Commit 910e46d

Browse files
committed
provide more docstrings
Signed-off-by: Oleg Silkin <[email protected]>
1 parent 7c467d4 commit 910e46d

File tree

7 files changed

+348
-267
lines changed

7 files changed

+348
-267
lines changed

processed-data/test-dataprocess.py renamed to examples/test-dataprocess.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,16 @@
1-
# Standard
2-
import argparse
3-
4-
# First Party
5-
from instructlab.training.data_process import process_data
1+
# SPDX-License-Identifier: Apache-2.0
62

73
"""
84
This file showcases how someone can use the data-processing script to
95
take a dataset from `messages` format into raw input_ids and labels
106
"""
117

8+
# Standard
9+
import argparse
10+
11+
# First Party
12+
from instructlab.training.data_process import process_data
13+
1214
parser = argparse.ArgumentParser()
1315

1416
parser.add_argument("--data-path", type=str, required=True)
@@ -38,7 +40,6 @@
3840
data_path=args.data_path,
3941
data_output_path=args.data_output_path,
4042
model_path=model_path,
41-
use_legacy_method=args.legacy,
4243
chat_tmpl_path=args.chat_tmpl_path,
4344
max_seq_len=args.max_seq_len,
4445
num_cpu_procs=1,

src/instructlab/training/config.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -170,12 +170,6 @@ class TrainingArgs(BaseModel):
170170
# after we have tokenized it
171171
data_output_dir: str
172172

173-
# New option to use the new data processor implementation
174-
use_legacy_data_processor: bool = Field(
175-
default=False,
176-
description="this field determines if the legacy data processor should be used instead",
177-
)
178-
179173
max_seq_len: int
180174
max_batch_len: int
181175
num_epochs: int
@@ -220,3 +214,12 @@ class TrainingArgs(BaseModel):
220214
# will overwrite the previous checkpoint directory, keeping only one directory called
221215
# "last_epoch". This works alongside the '--checkpoint_at_epoch' flag.
222216
keep_last_checkpoint_only: Optional[bool] = False
217+
218+
# TODO(osilkin):
219+
# we are only exposing this here because `run_training` today is implicitly coupled
220+
# with `process_data`. Since we don't have a specific field for data processing arguments,
221+
# we are forced to expose this. We should uncouple training from data processing and remove this.
222+
data_process_num_cpu_procs: int = Field(
223+
default=16,
224+
description="This is the number of processes used for multiprocessing when processing the data",
225+
)

0 commit comments

Comments
 (0)