From 347d22f3ec094c9931818cbbff6b5dacf1e83fbf Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 22 Apr 2022 17:22:36 +0200 Subject: [PATCH 001/142] added script to train tokenizer only on a subset of the dataset --- scripts/lang_adapt/tokenized4clm_sampled.py | 67 ++ .../madx_exp/madxlastlayer_lngembft_clm.py | 618 ++++++++++++++++++ 2 files changed, 685 insertions(+) create mode 100644 scripts/lang_adapt/tokenized4clm_sampled.py create mode 100644 scripts/madx_exp/madxlastlayer_lngembft_clm.py diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py new file mode 100644 index 0000000..672277a --- /dev/null +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -0,0 +1,67 @@ +import torch +import datasets +import transformers +from transformers import AutoTokenizer +from datasets import load_dataset +import pathlib + +import argparse +import sys + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) +log_level = -1 +logger.setLevel(log_level) +datasets.utils.logging.set_verbosity(log_level) +transformers.utils.logging.set_verbosity(log_level) +transformers.utils.logging.enable_default_handler() +transformers.utils.logging.enable_explicit_format() +tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + +parser = argparse.ArgumentParser() +parser.add_argument('--lang', type=str, required=True) +parser.add_argument('--tokenizer_dir', type=str, required=True) +parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) +parser.add_argument('--vocab_size', default=130_000, type=int) +parser.add_argument('--extend_vocab', action='store_true') +parser.add_argument('--sample_size', default=None, type=int) + +args = parser.parse_args() +lang = args.lang +if args.extend_vocab: + assert args.vocab_size < 100_000 + +if args.sample_size: + raw_datasets = load_dataset( + "oscar", + f"unshuffled_deduplicated_{lang}", + cache_dir=args.hf_cache_dir + )["train"].shuffle(seed=42).select(range(args.sample_size)) + +else: + raw_datasets = load_dataset( + "oscar", + f"unshuffled_deduplicated_{lang}", + cache_dir=args.hf_cache_dir + )["train"] + +print(f"✅ Loaded raw_datasets OSCAR language {lang}") + +def batch_iterator(): + batch_size = 1000 + for i in range(0, len(raw_datasets), batch_size): + yield raw_datasets[i : i + batch_size]["text"] + +tokenizer = AutoTokenizer.from_pretrained("gpt2") +assert tokenizer.is_fast +new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) +print("✅ Trained tokenizer with len ", len(new_tokenizer)) + +new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}") +print(f"✅ Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}") diff --git a/scripts/madx_exp/madxlastlayer_lngembft_clm.py b/scripts/madx_exp/madxlastlayer_lngembft_clm.py new file mode 100644 index 0000000..7234cea --- /dev/null +++ b/scripts/madx_exp/madxlastlayer_lngembft_clm.py @@ -0,0 +1,618 @@ +""" +Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import torch +import pathlib + +import 
datasets +from datasets import load_dataset + +import transformers +import transformers.adapters.composition as ac +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AdapterTrainer, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + MultiLingAdapterArguments, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.adapters.configuration import AdapterConfig +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def load_tokenizer(model_args): + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
+ ) + return tokenizer + + + +def load_data(data_args, model_args): + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + + if "validation" not in raw_datasets.keys(): + if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: + raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) + elif data_args.max_eval_samples is not None : + raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) + else: + raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) + + raw_datasets['validation'] = raw_datasets['test'] + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ + return raw_datasets + +def load_model(model_args, tokenizer): + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + #TODO: remap embedding parameters + #if not tokenizer.name_or_path == model_args.model_name_or_path: + # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + + model.resize_token_embeddings(len(tokenizer)) + return model + +def preprocess_data(training_args, data_args, model_args, tokenizer): + with training_args.main_process_first(desc="dataset map tokenization"): + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_datasets.pt") + if not tokenizer.name_or_path == model_args.model_name_or_path: + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_tokenized_datasets.pt") + + saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + logger.info("Sanity check: loaded tokenized_datasets") + else: + raw_datasets = load_data(data_args, model_args) + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + + text_column_name = "text" if "text" in column_names else column_names[0] + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 
+ ) + return output + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + logger.info("Sanity check: saved tokenized_datasets") + if "train" not in tokenized_datasets and training_args.do_train: + raise ValueError("--do_train requires a train dataset") + if "validation" not in tokenized_datasets and training_args.do_eval: + raise ValueError("--do_eval requires a validation dataset") + return tokenized_datasets + + +def get_lm_dataset(training_args, data_args, model_args, tokenizer): + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_datasets.pt") + if not tokenizer.name_or_path == model_args.model_name_or_path: + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_lm_datasets.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + logger.info("Sanity check: loaded lm_datasets") + else: + + tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + logger.info("Sanity check: saved lm_datasets") + return lm_datasets + +def add_adapters(adapter_args, data_args, model): + # Setup adapters + if adapter_args.train_adapter: + task_name = data_args.dataset_name or "clm" + task_name += f"_{adapter_args.language}" + # check if adapter already exists, otherwise add it + if task_name not in model.config.adapters: + # resolve the adapter config + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + leave_out = [i for i in range(0,23)] + ) + # load a pre-trained from Hub if specified + if adapter_args.load_adapter: + model.load_adapter( + adapter_args.load_adapter, + config=adapter_config, + load_as=task_name, + ) + # otherwise, add a fresh adapter + else: + model.add_adapter(task_name, config=adapter_config) + # optionally load a pre-trained language adapter + if adapter_args.load_lang_adapter: + # resolve the language adapter config + lang_adapter_config = AdapterConfig.load( + adapter_args.lang_adapter_config, + non_linearity=adapter_args.lang_adapter_non_linearity, + reduction_factor=adapter_args.lang_adapter_reduction_factor, + ) + # load the language adapter from Hub + lang_adapter_name = model.load_adapter( + adapter_args.load_lang_adapter, + config=lang_adapter_config, + load_as=adapter_args.language, + ) + else: + lang_adapter_name = None + # Freeze all model weights except of those of this adapter + model.train_adapter([task_name]) + # Set the adapters to be used in every forward pass + if lang_adapter_name: + model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + else: + model.set_active_adapters(task_name) + else: + if adapter_args.load_adapter or adapter_args.load_lang_adapter: + raise ValueError( + "Adapters can only be loaded in adapters training mode." 
+ "Use --train_adapter to enable adapter training" + ) + trainable_params = 0 + frozen_params = 0 + emb_params = 0 + for name, param in model.named_parameters(): + if not param.requires_grad: + if not "wte" in name and not "lm_head" in name: + print(f"🥶 Frozen layer '{name}'") + frozen_params +=param.numel() + else: + param.requires_grad = True + print(f"🚀 Trainable layer '{name}'") + emb_params += param.numel() + else: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + print(f"Total frozen parameters: {frozen_params}") + print(f"Total emb parameters: {emb_params}") + print(f"Total trainable parameters: {trainable_params}") + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, adapter_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() + training_args.data_dir = f'{training_args.output_dir}/../' + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"model_args {model_args}") + logger.info(f"data_args {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + logger.info(f"Adapter parameters {adapter_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + pass + #raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." + #) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + tokenizer = load_tokenizer(model_args) + model = load_model(model_args, tokenizer) + + add_adapters(adapter_args, data_args, model) + # Preprocessing the datasets. 
+ lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) + if training_args.do_train: + train_dataset = lm_datasets["train"] + + if training_args.do_eval: + + eval_dataset = lm_datasets["validation"] + + + # Initialize our Trainer + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. + data_collator=default_data_collator, + ) + + logger.info(model) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + +# if training_args.push_to_hub: +# trainer.push_to_hub(**kwargs) +# else: +# trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() From 7069276ceec6b146324b5c69f9aa3290e62d190e Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 22 Apr 2022 17:23:56 +0200 Subject: [PATCH 002/142] added script to train tokenizer only on a subset of the dataset --- scripts/lang_adapt/train_tokenizer.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 scripts/lang_adapt/train_tokenizer.sh diff --git a/scripts/lang_adapt/train_tokenizer.sh b/scripts/lang_adapt/train_tokenizer.sh new file mode 100644 index 0000000..7a95182 --- /dev/null +++ b/scripts/lang_adapt/train_tokenizer.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=cpu + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + + + +bs_dir=/tmp-network/user/vnikouli/Projects/bigscience +lng=$1 +sample_size=$2 +vocab_size=$3 +source $bs_dir/multilingual-modeling/scripts/env/bin/activate +python tokenized4clm_sampled.py --lang $lng --tokenizer_dir $bs_dir/tokenizers --hf_cache_dir $bs_dir/data 
--vocab_size $vocab_size --sample_size $sample_size
+

From ad6d511f6c00f0da742696cb8acf8125372d698c Mon Sep 17 00:00:00 2001
From: Vassilina Nikoulina
Date: Fri, 22 Apr 2022 17:27:34 +0200
Subject: [PATCH 003/142] updated instructions for samples tokenizer

---
 scripts/lang_adapt/README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md
index afc084d..f2ed6c2 100644
--- a/scripts/lang_adapt/README.md
+++ b/scripts/lang_adapt/README.md
@@ -7,6 +7,14 @@ Run `tokenized4clm.py` to train the tokenizer on OSCAR dataset.
 - `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer.
 - `vocab_size`: vocab size of the tokenizer
 
+
+Run `tokenized4clm_sampled.py` to train the tokenizer on a subset of the OSCAR dataset.
+- `lang`: language name (e.g., "de", "th")
+- `tokenizer_dir`: directory in which to save the tokenizer. The tokenizer will be saved as `{lang}_oscar_{sample_size}_tokenizer_{vocab_size}`.
+- `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer.
+- `vocab_size`: vocab size of the tokenizer
+- `sample_size`: the number of randomly selected samples used to train the tokenizer
+
 ### Language Adaptation (6 Combinations)
 - use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`.
 - use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`.
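Note on how these pieces fit together: the directory written by `tokenized4clm_sampled.py` is the same one later passed to `madx_run_clm.py` as `--tokenizer_name` (see `run_clm_adpt_vn.sh` in the next patch). A minimal sketch of loading the trained tokenizer back; the path and the language, sample-size, and vocab-size values here are placeholders for illustration, not values taken from the patches:

```python
from transformers import AutoTokenizer

# Placeholder values for illustration only.
tokenizer_dir = "/path/to/tokenizers"               # value passed as --tokenizer_dir
lang, sample_size, vocab_size = "de", 10000, 24000  # assumed example settings

# tokenized4clm_sampled.py saves the trained tokenizer to
# f"{tokenizer_dir}/{lang}_oscar_{sample_size}_tokenizer_{vocab_size}"
new_tokenizer = AutoTokenizer.from_pretrained(
    f"{tokenizer_dir}/{lang}_oscar_{sample_size}_tokenizer_{vocab_size}"
)
print(len(new_tokenizer))  # sanity check: should be close to the requested vocab size
```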
From 4e1f13710428f8d7af0456bdf93c59ef638632f2 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 22 Apr 2022 17:29:52 +0200 Subject: [PATCH 004/142] updated training script: added some extra parameters in the running script, and changed the slurm running settings --- scripts/lang_adapt/run_clm_adpt_vn.sh | 83 +++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 scripts/lang_adapt/run_clm_adpt_vn.sh diff --git a/scripts/lang_adapt/run_clm_adpt_vn.sh b/scripts/lang_adapt/run_clm_adpt_vn.sh new file mode 100644 index 0000000..44d12af --- /dev/null +++ b/scripts/lang_adapt/run_clm_adpt_vn.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH -p gpu +#SBATCH --gres="gpu:1" + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J run_clm_madx + +# Specify an output file +#SBATCH -o /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_madx-%j.out +#SBATCH -e /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_madx-%j.err + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com +#SBATCH --constraint="gpu_v100&gpu_32g" + +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience +# Set up the environment by loading modules +source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate + +data_sample=$1 +ch=118500 +lng=$2 +adapter_reduction_factor=$3 +dataset=oscar +adapter_config="pfeiffer+inv" +vocabsize=24000 +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +tokenizer_dir="${FP_BIGS}/tokenizers/${lng}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k +cache_dir="${FP_BIGS}/data/" +data_dir="${FP_BIGS}/exp-ext-${lng}/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}-$( basename $tokenizer_dir )" +data_tok_dir=${data_dir}/lng_tok + +output_dir="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +logging_dir="${FP_BIGS}/logs/exp-ext-${lng}/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}-$( basename $tokenizer_dir )/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +echo $output_dir + +BIGS_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name + + +mkdir -p $output_dir +mkdir -p $logging_dir + +adapter_config="pfeiffer+inv" +python $FP_BIGS/multilingual-modeling/scripts/lang_adapt/madx_run_clm.py \ + --seed 0 \ + --fp16 \ + --model_name_or_path $BIGS_MODEL \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name "unshuffled_deduplicated_${lng}" \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 1000 \ + --evaluation_strategy "epoch" \ + --max_eval_samples 5000 \ + --save_steps 10000 \ + --save_strategy "steps" \ + --save_total_limit 3 \ + --max_train_samples $data_sample \ + --max_steps 50000 \ + --train_adapter \ + --load_best_model_at_end \ + --lang_adapt_strategies "emb-and-adpt" \ + --embedding_strategies "overlap-replace" \ + --adapter_reduction_factor $adapter_reduction_factor \ + --adapter_config ${adapter_config} \ + --language $lng From 7ff1c18bf7a72b1b9053bc9375c023caae9a58b7 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 22 Apr 2022 17:35:34 +0200 
Subject: [PATCH 005/142] added overlap-replace parameter, added possibility to save embedding layer (instead of whole model), added early stopping, --- scripts/lang_adapt/madx_run_clm.py | 80 +++++++++++++++++------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index bcea14c..de46184 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -16,6 +16,8 @@ from datasets import load_dataset import transformers +from transformers import EarlyStoppingCallback + import transformers.adapters.composition as ac from transformers import ( CONFIG_MAPPING, @@ -105,7 +107,7 @@ class ModelArguments: ) embedding_strategies: str = field( default="", - metadata={"help": "choose one of the two strategies - 'replace', 'extend'"}, + metadata={"help": "choose one of the two strategies - 'replace', 'extend', 'overlap-replace'"}, ) def __post_init__(self): @@ -242,9 +244,9 @@ def load_data(data_args, model_args): if "validation" not in raw_datasets.keys(): if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: - raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) + raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples*2, test_size = data_args.max_eval_samples*2) elif data_args.max_eval_samples is not None : - raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) + raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples*2) else: raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) raw_datasets['validation'] = raw_datasets['test'] @@ -256,12 +258,12 @@ def load_data(data_args, model_args): # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
- - if data_args.max_train_samples is not None: + if data_args.max_train_samples is not None and len(raw_datasets['train']) > data_args.max_train_samples: # FIXME: currently assume the loaded checkpoint is trained with the first data_args.max_train_samples number of samples - raw_datasets["train"] = raw_datasets["train"].filter(lambda example, indice: indice < data_args.max_train_samples, with_indices=True) + #raw_datasets["train"] = raw_datasets["train"].filter(lambda example, indice: indice < data_args.max_train_samples, with_indices=True) raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) - if data_args.max_eval_samples is not None: + + if data_args.max_eval_samples is not None and len(raw_datasets['validation']) > data_args.max_eval_samples: raw_datasets["validation"] = raw_datasets["validation"].select(range(data_args.max_eval_samples)) return raw_datasets @@ -297,16 +299,12 @@ def load_model(model_args, tokenizer): logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") #TODO: remap embedding parameters - #if not tokenizer.name_or_path == model_args.model_name_or_path: - # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - - model.resize_token_embeddings(len(tokenizer)) + return model def preprocess_data(training_args, data_args, model_args, tokenizer): with training_args.main_process_first(desc="dataset map tokenization"): saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_data.pt") - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) logger.info(f"✅ loaded tokenized_data") @@ -314,7 +312,7 @@ def preprocess_data(training_args, data_args, model_args, tokenizer): raw_datasets = load_data(data_args, model_args) assert len(raw_datasets['train']) == data_args.max_train_samples logger.info(f"🧠 Sanity check: loaded raw datasets have {data_args.max_train_samples} samples") - + # First we tokenize all the texts. 
if training_args.do_train: column_names = raw_datasets["train"].column_names @@ -343,8 +341,10 @@ def tokenize_function(examples): load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) logger.info(f"✅ saved tokenized_data") + if "train" not in tokenized_datasets and training_args.do_train: raise ValueError("--do_train requires a train dataset") if "validation" not in tokenized_datasets and training_args.do_eval: @@ -408,17 +408,18 @@ def group_texts(examples): ) torch.save(lm_datasets, saved_lm_datasets_fp) logger.info("✅ saved lm_data") - print(lm_datasets) return lm_datasets -def modify_model(adapter_args, data_args, model_args, model): - if model_args.lang_adapt_strategies == "emb": - for name, param in model.named_parameters(): - if "wte" not in name and "wpe" not in name: - param.requires_grad = False +def modify_model(adapter_args, data_args, model_args, tokenizer, model): + #if "emb" in model_args.lang_adapt_strategies: + # if "replace" in model_args.embedding_strategies: + # for name, param in model.named_parameters(): + # if "wte" not in name and "wpe" not in name and "lm_head" not in name: + # param.requires_grad = False + # Setup adapters - elif adapter_args.train_adapter: + if adapter_args.train_adapter: task_name = data_args.dataset_name or "clm" task_name += f"_{adapter_args.language}" # check if adapter already exists, otherwise add it @@ -456,18 +457,29 @@ def modify_model(adapter_args, data_args, model_args, model): else: lang_adapter_name = None # Freeze all model weights except of those of this adapter - model.train_adapter([task_name]) + model.train_adapter(task_name, train_embeddings=True) # Set the adapters to be used in every forward pass - if lang_adapter_name: - model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) - else: - model.set_active_adapters(task_name) + #if lang_adapter_name: + # model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + #else: + # model.set_active_adapters(task_name) + else: if adapter_args.load_adapter or adapter_args.load_lang_adapter: raise ValueError( "Adapters can only be loaded in adapters training mode." 
"Use --train_adapter to enable adapter training" ) + + if model_args.embedding_strategies == "overlap-replace": + if not tokenizer.name_or_path == model_args.model_name_or_path: + orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + model.add_embeddings('lng_emb', tokenizer, reference_embedding='default', reference_tokenizer=orig_tokenizer ) + model._active_embedding = "lng_emb" + model.delete_embeddings('default') + model.tie_weights() + elif model_args.embedding_strategies == "replace": + model.resize_token_embeddings(len(tokenizer)) trainable_params = 0 frozen_params = 0 emb_params = 0 @@ -478,7 +490,7 @@ def modify_model(adapter_args, data_args, model_args, model): else: print(f"🚀 Trainable layer '{name}'") trainable_params += param.numel() - + if "wte" and "wpe" in name: emb_params += param.numel() @@ -504,7 +516,7 @@ def main(): training_args.data_dir = f'{training_args.output_dir}' assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt') - assert model_args.embedding_strategies in ('replace', 'extend') + assert model_args.embedding_strategies in ('replace', 'extend', 'overlap-replace') # Setup logging logging.basicConfig( @@ -551,8 +563,7 @@ def main(): tokenizer = load_tokenizer(model_args) model = load_model(model_args, tokenizer) - - modify_model(adapter_args, data_args, model_args, model) + modify_model(adapter_args, data_args, model_args, tokenizer, model) # Preprocessing the datasets. lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) if training_args.do_train: @@ -560,8 +571,6 @@ def main(): if training_args.do_eval: eval_dataset = lm_datasets["validation"] - - # Initialize our Trainer trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer trainer = trainer_class( @@ -571,7 +580,7 @@ def main(): eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. - data_collator=default_data_collator, + data_collator=default_data_collator ) logger.info(model) @@ -583,8 +592,11 @@ def main(): checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint + trainer.add_callback(EarlyStoppingCallback(3)) train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload + trainer.save_model() # Saves the tokenizer too for easy upload # normally this part only saves the adapters? 
(TODO: check) + + trainer.model.save_embeddings(f'{trainer.args.output_dir}/embedding_layer') metrics = train_result.metrics @@ -635,4 +647,4 @@ def _mp_fn(index): if __name__ == "__main__": - main() \ No newline at end of file + main() From afb108df94d74e7e28bad6979a69456ec1d216a3 Mon Sep 17 00:00:00 2001 From: yongzx Date: Fri, 6 May 2022 08:03:32 -0400 Subject: [PATCH 006/142] update madx_run_clm --- scripts/lang_adapt/madx_run_clm.py | 117 ++++++++++++++++------------- 1 file changed, 63 insertions(+), 54 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index bcea14c..d2c50fa 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -408,7 +408,6 @@ def group_texts(examples): ) torch.save(lm_datasets, saved_lm_datasets_fp) logger.info("✅ saved lm_data") - print(lm_datasets) return lm_datasets def modify_model(adapter_args, data_args, model_args, model): @@ -416,71 +415,75 @@ def modify_model(adapter_args, data_args, model_args, model): for name, param in model.named_parameters(): if "wte" not in name and "wpe" not in name: param.requires_grad = False - - # Setup adapters - elif adapter_args.train_adapter: - task_name = data_args.dataset_name or "clm" - task_name += f"_{adapter_args.language}" - # check if adapter already exists, otherwise add it - if task_name not in model.config.adapters: - # resolve the adapter config - adapter_config = AdapterConfig.load( - adapter_args.adapter_config, - non_linearity=adapter_args.adapter_non_linearity, - reduction_factor=adapter_args.adapter_reduction_factor, - ) - # load a pre-trained from Hub if specified - if adapter_args.load_adapter: - model.load_adapter( - adapter_args.load_adapter, - config=adapter_config, - load_as=task_name, + elif model_args.lang_adapt_strategies == "emb-and-adpt": + # Setup adapters + if adapter_args.train_adapter: + task_name = data_args.dataset_name or "clm" + task_name += f"_{adapter_args.language}" + # check if adapter already exists, otherwise add it + + if task_name not in model.config.adapters: + # resolve the adapter config + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + ) + # load a pre-trained from Hub if specified + if adapter_args.load_adapter: + model.load_adapter( + adapter_args.load_adapter, + config=adapter_config, + load_as=task_name, + ) + # otherwise, add a fresh adapter + else: + model.add_adapter(task_name, config=adapter_config) + + # optionally load a pre-trained language adapter + if adapter_args.load_lang_adapter: + # resolve the language adapter config + lang_adapter_config = AdapterConfig.load( + adapter_args.lang_adapter_config, + non_linearity=adapter_args.lang_adapter_non_linearity, + reduction_factor=adapter_args.lang_adapter_reduction_factor, + ) + # load the language adapter from Hub + lang_adapter_name = model.load_adapter( + adapter_args.load_lang_adapter, + config=lang_adapter_config, + load_as=adapter_args.language, ) - # otherwise, add a fresh adapter else: - model.add_adapter(task_name, config=adapter_config) - # optionally load a pre-trained language adapter - if adapter_args.load_lang_adapter: - # resolve the language adapter config - lang_adapter_config = AdapterConfig.load( - adapter_args.lang_adapter_config, - non_linearity=adapter_args.lang_adapter_non_linearity, - reduction_factor=adapter_args.lang_adapter_reduction_factor, - ) - # load the language adapter from 
Hub - lang_adapter_name = model.load_adapter( - adapter_args.load_lang_adapter, - config=lang_adapter_config, - load_as=adapter_args.language, - ) - else: - lang_adapter_name = None - # Freeze all model weights except of those of this adapter - model.train_adapter([task_name]) - # Set the adapters to be used in every forward pass - if lang_adapter_name: - model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + lang_adapter_name = None + + # Freeze all model weights except of those of this adapter + model.train_adapter([task_name]) + # Set the adapters to be used in every forward pass + if lang_adapter_name: + model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + else: + model.set_active_adapters(task_name) else: - model.set_active_adapters(task_name) - else: - if adapter_args.load_adapter or adapter_args.load_lang_adapter: - raise ValueError( - "Adapters can only be loaded in adapters training mode." - "Use --train_adapter to enable adapter training" - ) + if adapter_args.load_adapter or adapter_args.load_lang_adapter: + raise ValueError( + "Adapters can only be loaded in adapters training mode." + "Use --train_adapter to enable adapter training" + ) trainable_params = 0 frozen_params = 0 emb_params = 0 for name, param in model.named_parameters(): + if "wte" in name or "wpe" in name: + param.requires_grad = True + emb_params += param.numel() + if not param.requires_grad: print(f"🥶 Frozen layer '{name}'") frozen_params += param.numel() else: print(f"🚀 Trainable layer '{name}'") trainable_params += param.numel() - - if "wte" and "wpe" in name: - emb_params += param.numel() print(f"Total frozen parameters: {frozen_params}") print(f"Total emb parameters (wte, wpe): {emb_params}") @@ -574,7 +577,8 @@ def main(): data_collator=default_data_collator, ) - logger.info(model) + print("Model: 👇") + print(model) # Training if training_args.do_train: @@ -585,6 +589,11 @@ def main(): checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload + + # Saves the model's embedding layer + for name, param in trainer.model.named_parameters(): + if "wte" in name or "wpe" in name: + torch.save(param.data, f"{training_args.output_dir}/{name}.pt") metrics = train_result.metrics From a79bfd060745f69f4eeab6a5ea0811fcda3cf0af Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 6 May 2022 15:35:58 +0200 Subject: [PATCH 007/142] adapted xnli script to properly load wte, wpe and adapters --- scripts/eval_xnli/adapters_xnli_de_vn.py | 228 +++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 scripts/eval_xnli/adapters_xnli_de_vn.py diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py new file mode 100644 index 0000000..45ae562 --- /dev/null +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -0,0 +1,228 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", 
type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter") +#parser.add_argument("--adapter_lang_name", required=True) -- why is this required?? +parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigsciece model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.do_train: + if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) + else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + + + small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) + small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) + logger.info(full_train_dataset[0]) + logger.info(full_train_dataset[100]) + +full_test_dataset = test_dataset.map(tokenize_function, 
batched=False) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args, inference=False): + # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one + # even when we call load_adapter + if not args.original_model == args.pretrained_model: + wte = torch.load(f'{args.pretrained_model}/embedding.pt') + wpe = torch.load(f'{args.pretrained_model}/positional_embedding.pt') + + model = GPT2ForSequenceClassification.from_pretrained(args.original_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + if not args.zero_shot: + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + if not args.original_model == args.pretrained_model: + causal_lm_model.transformer.wte = wte + causal_lm_model.transformer.wpe = wpe + if args.madx_lang_adapter: + adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + model.transformer = causal_lm_model.transformer + model.set_active_adapters(adapter_name) + + if not inference: + #if not args.zero_shot: normally need to add adapter in any case + # normally this is already done, why use adapter_lang_name here? 
+ #if args.madx_lang_adapter: + # adapter_name = model.load_adapter(args.madx_lang_adapter, + # config="pfeiffer+inv", + # load_as=args.adapter_lang_name) + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + + + print("🔥 ==================== Training: ==================== 🔥") + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + print(model) + else: + #if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + # normally this is done in any case + #adapter_name = model.load_adapter(args.madx_lang_adapter) + #model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + #else: + # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + # # not sure what happens here + # # for TGT -> TGT supervised finetuning setting, change adapter_name + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + # model.set_active_adapters(adapter_name) + print(model) + + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) From c9a8cec60c4dab4f95769705ba58d2650a73688d Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 6 May 2022 15:37:39 +0200 Subject: [PATCH 008/142] updated the way we save the model; added fp16 training --- scripts/lang_adapt/madx_run_clm.py | 4 +++- scripts/lang_adapt/run_clm_adpt_vn.sh | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index de46184..36e610f 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -596,7 +596,9 @@ def main(): train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload # normally this part only saves the adapters? 
(TODO: check) - trainer.model.save_embeddings(f'{trainer.args.output_dir}/embedding_layer') + # save embedding and positional embedding (which is not saved by trainer) + trainer.model.save_embeddings(trainer.args.output_dir, 'lng_emb') + torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/positional_embedding.pt') metrics = train_result.metrics diff --git a/scripts/lang_adapt/run_clm_adpt_vn.sh b/scripts/lang_adapt/run_clm_adpt_vn.sh index 44d12af..c585c22 100644 --- a/scripts/lang_adapt/run_clm_adpt_vn.sh +++ b/scripts/lang_adapt/run_clm_adpt_vn.sh @@ -28,7 +28,7 @@ lng=$2 adapter_reduction_factor=$3 dataset=oscar adapter_config="pfeiffer+inv" -vocabsize=24000 +vocabsize=1000 model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" tokenizer_dir="${FP_BIGS}/tokenizers/${lng}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k cache_dir="${FP_BIGS}/data/" @@ -70,9 +70,9 @@ python $FP_BIGS/multilingual-modeling/scripts/lang_adapt/madx_run_clm.py \ --evaluation_strategy "epoch" \ --max_eval_samples 5000 \ --save_steps 10000 \ - --save_strategy "steps" \ - --save_total_limit 3 \ - --max_train_samples $data_sample \ + --save_strategy "epoch" \ + --save_total_limit 3 \ + --max_train_samples ${data_sample}\ --max_steps 50000 \ --train_adapter \ --load_best_model_at_end \ @@ -80,4 +80,4 @@ python $FP_BIGS/multilingual-modeling/scripts/lang_adapt/madx_run_clm.py \ --embedding_strategies "overlap-replace" \ --adapter_reduction_factor $adapter_reduction_factor \ --adapter_config ${adapter_config} \ - --language $lng + --language $lng &> $output_dir/train.log From 3e8bd62d82dc7eb1a6358843c9a924dd8d3457bc Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Mon, 9 May 2022 14:10:00 -0400 Subject: [PATCH 009/142] add xlsum script (version #1) --- scripts/eval_xnli/adapters_xlsum_de.py | 266 +++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 scripts/eval_xnli/adapters_xlsum_de.py diff --git a/scripts/eval_xnli/adapters_xlsum_de.py b/scripts/eval_xnli/adapters_xlsum_de.py new file mode 100644 index 0000000..e396964 --- /dev/null +++ b/scripts/eval_xnli/adapters_xlsum_de.py @@ -0,0 +1,266 @@ +import argparse +import os +import sys +from loguru import logger + +from datasets import load_dataset +from datasets import load_metric + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2LMHeadModel, AutoModelForCausalLM + + +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="german") #xlsum requires a language name, not language code +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, 
action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") +parser.add_argument("--revision", type=str, default="main") +parser.add_argument("--local_rank", type=int, default=0) + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter") +parser.add_argument("--adapter_lang_name", required=True) +parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +parser.add_argument("--deepspeed", required=False) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +torch.cuda.set_device(args.local_rank) + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigscience model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + + +# load xlsum dataset +if args.zero_shot: + print("Cross Lingual") + en_dataset = load_dataset("xlsum", "english", cache_dir=args.cache_dir) + dataset = load_dataset("xlsum", args.lang, cache_dir=args.cache_dir) + + train_dataset = en_dataset["train"] + val_dataset = en_dataset["validation"] + test_dataset = dataset["test"] +else: + print("Supervised training") + dataset = load_dataset("xlsum", args.lang, cache_dir=args.cache_dir) + + train_dataset = dataset["train"] + val_dataset = dataset["validation"] + test_dataset = dataset["test"] + + +# load tokenizer + +# if args.revision is not None: +# print("revision: ", args.revision) +# tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision) + +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision) + +tokenizer.pad_token = tokenizer.eos_token + +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision) + + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(example): + inputs = tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) + + with tokenizer.as_target_tokenizer(): + summaries = tokenizer(f'{example["summary"]}', max_length=256, padding="max_length", truncation=True) + + inputs["labels"] = summaries["input_ids"] + + return inputs + +if args.zero_shot: + def en_tokenize_function(example): + inputs = en_tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) + + with en_tokenizer.as_target_tokenizer(): + summaries = en_tokenizer(f'{example["summary"]}', max_length=256, padding="max_length", truncation=True) + + inputs["labels"] = summaries["input_ids"] + + return inputs + +logger.info("tokenizing dataset...") + +full_train_dataset = train_dataset.map(tokenize_function, batched=False) #TODO: unbatch this? 
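+# Note: tokenize_function formats one example at a time with f-strings, so these map()
+# calls must keep batched=False; switching to batched=True (see TODO above) would require
+# rewriting the tokenize functions to accept lists of texts and summaries.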
+full_val_dataset = val_dataset.map(tokenize_function, batched=False) +full_test_dataset = test_dataset.map(tokenize_function, batched=False) + +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + + +logger.info(full_train_dataset[0]) +logger.info(full_val_dataset[0]) + +metric = load_metric("rouge", cache_dir=args.cache_dir) + +def compute_metrics(eval_preds): ##TODO: implement this + preds, labels = eval_preds + + return metric(preds, labels) + + +training_args = TrainingArguments( + output_dir=args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else None, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, + deepspeed=args.deepspeed, +) + +def load_model(args, inference=False): + + # Hack for loading wte module not needed here, since using a causal language model class + if args.zero_shot and not inference: + model = GPT2LMHeadModel.from_pretrained(args.pretrained_model, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision) + else: + model = GPT2LMHeadModel.from_pretrained(args.pretrained_model, + pad_token_id=tokenizer.pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision) + if not args.zero_shot or (args.zero_shot and inference): + # if not zero shot, that means that we need to replace the embedding layers during training + # we also need to replace embedding layers during inference + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model, revision=args.revision) + + # change the embedding layer of the original big science model + # by loading the adapters (which has saved lm_head) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + if args.madx_lang_adapter: + causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + + # model has original bigscience embedding so replace it. 
+ model.resize_token_embeddings(len(tokenizer)) + model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] + + # TODO: change the logic here for loading/training the adapters + if not inference: + if not args.zero_shot: + if args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv", + load_as=args.adapter_lang_name) + if args.finetune_strategies == "whole": + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "lang_adapters": + model.train_adapter([args.adapter_lang_name]) + elif args.finetune_strategies == "task_adapters": + model.add_adapter("xlsum-task-adapter") + model.train_adapter("xlsum-task-adapter") + else: + raise ValueError("invalid configuration") + + print("🔥 ==================== Training: ==================== 🔥") + # for name, param in model.named_parameters(): + # if not param.requires_grad: + # print(f"🥶 Frozen layer '{name}'") + # else: + # print(f"🚀 Trainable layer '{name}'") + # print(model) + else: + print("🔥 ==================== Inference: ==================== 🔥") + if args.finetune_strategies == "lang_adapters": + assert args.pretrained_adapters_dir + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "task_adapters": + if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + adapter_name = model.load_adapter(args.madx_lang_adapter) + model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xlsum-task-adapter") + model.set_active_adapters(adapter_name) + else: + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xlsum-task-adapter") #TODO: change the argument to this + model.set_active_adapters(adapter_name) + # print(model) + + + return model + +if args.do_train: + logger.info("Starting training...") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics, + ) + + trainer.train() + +if args.do_predict: + if arg.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith("checkpoint-")], + key=lambda x: int(x[len('checkpoint-'):]))) + assert len(evaluation_dirs) > 0 + logger.info(f"Found {len(evaluation_dirs)} checkpoints") + + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics, + ) + + print("Evaluating on test set...", trainer.evaluate()) From 2cd27a33bceaf0a9629ee00174f63ffeec4e56bc Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Mon, 9 May 2022 14:42:25 -0400 Subject: [PATCH 010/142] add unified eval script --- scripts/eval_xnli/adapters_eval.py | 292 ++++++++++++++++++++++++++ scripts/eval_xnli/crosslingual_exp.sh | 40 ++++ 2 files changed, 332 insertions(+) create mode 100644 scripts/eval_xnli/adapters_eval.py create mode 100644 
scripts/eval_xnli/crosslingual_exp.sh diff --git a/scripts/eval_xnli/adapters_eval.py b/scripts/eval_xnli/adapters_eval.py new file mode 100644 index 0000000..649ce1b --- /dev/null +++ b/scripts/eval_xnli/adapters_eval.py @@ -0,0 +1,292 @@ +import argparse +import os +import sys +from loguru import logger + +from datasets import load_dataset +from datasets import load_metric + +import torch +import numpy as np +from transformers import TrainingArguments, AdapterTrainer +from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2ForSequenceClassification, AutoModelForCausalLM + + +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="german") #xlsum requires a language name, not language code + +tasks = ["xnli", "xlsum"] +parser.add_argument("--dataset", choices=tasks, required=True) + +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") +parser.add_argument("--revision", type=str, default="main") +parser.add_argument("--local_rank", type=int) + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter") +parser.add_argument("--adapter_lang_name", required=True) +parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +parser.add_argument("--deepspeed", required=False) + +# mapping of task to model_class +model_class_mapping = {"xnli": GPT2ForSequenceClassification, "xlsum": GPT2LMHeadModel} + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.local_rank: + torch.cuda.set_device(args.local_rank) + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigscience model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + +# load appropriate dataset +logger.info("Loading dataset...") + +# will need to rename splits if the dataset has different name for validation set +if args.zero_shot: + print("Cross Lingual") + # cross lingual: use english as train and validation set + en_dataset = load_dataset(args.dataset, "english", cache_dir=args.cache_dir) + dataset = load_dataset(args.dataset, args.lang, cache_dir=args.cache_dir) + + train_dataset = en_dataset["train"] + val_dataset = en_dataset["validation"] + test_dataset = dataset["test"] +else: + print("Supervised training") + dataset = load_dataset(args.dataset, args.lang, cache_dir=args.cache_dir) + + train_dataset = dataset["train"] + val_dataset = dataset["validation"] + test_dataset = dataset["test"] + +logger.info("Loading tokenizer...") +# load tokenizer + +tokenizer = 
AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision) +tokenizer.pad_token = tokenizer.eos_token + +def tokenize_function(example): + inputs = tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) + + with tokenizer.as_target_tokenizer(): + summaries = tokenizer(f'{example["summary"]}', max_length=256, padding="max_length", truncation=True) + + inputs["labels"] = summaries["input_ids"] + + return inputs + +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision) + en_tokenizer.pad_token = en_tokenizer.eos_token + + if args.dataset == "xnli": + elif args.dataset == "xlsum": + def en_tokenize_function(example): + inputs = en_tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) + + with en_tokenizer.as_target_tokenizer(): + summaries = en_tokenizer(f'{example["summary"]}', max_length=256, padding="max_length", truncation=True) + + inputs["labels"] = summaries["input_ids"] + + return inputs + + + +if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) + + +# load metric +logger.info("Loading metric...") + +if args.dataset == "xnli": + metric = load_metric("xnli") + + def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +elif args.dataset == "xlsum": + metric = load_metric("rouge", cache_dir=args.cache_dir) + + def compute_metrics(eval_preds): + preds, labels = eval_preds + + return metric(preds, labels) + +else: + raise ValueError("Unknown dataset provided") + + +training_args = TrainingArguments( + output_dir=args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else None, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, + deepspeed=args.deepspeed, +) + +# TODO: double-check the adapter loading logic here +def load_model(args, inference=False): + + # Hack for loading wte module not needed here, since using a causal language model class + if args.zero_shot and not inference: + model = model_class_mapping[args.task].from_pretrained(args.pretrained_model, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision) + else: + model = model_class_mapping[args.task].from_pretrained(args.pretrained_model, + pad_token_id=tokenizer.pad_token_id, + cache_dir=args.cache_dir, + 
revision=args.revision) + if not args.zero_shot or (args.zero_shot and inference): + # if not zero shot, that means that we need to replace the embedding layers during training + # we also need to replace embedding layers during inference + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model, revision=args.revision) + + # change the embedding layer of the original big science model + # by loading the adapters (which has saved lm_head) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + if args.madx_lang_adapter: + causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + + # model has original bigscience embedding so replace it. + model.resize_token_embeddings(len(tokenizer)) + model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] + + if not inference: + if not args.zero_shot: + if args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv", + load_as=args.adapter_lang_name) + if args.finetune_strategies == "whole": + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "lang_adapters": + model.train_adapter([args.adapter_lang_name]) + elif args.finetune_strategies == "task_adapters": + model.add_adapter("xlsum-task-adapter") + model.train_adapter("xlsum-task-adapter") + else: + raise ValueError("invalid configuration") + + print("🔥 ==================== Training: ==================== 🔥") + # for name, param in model.named_parameters(): + # if not param.requires_grad: + # print(f"🥶 Frozen layer '{name}'") + # else: + # print(f"🚀 Trainable layer '{name}'") + # print(model) + else: + print("🔥 ==================== Inference: ==================== 🔥") + if args.finetune_strategies == "lang_adapters": + assert args.pretrained_adapters_dir + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "task_adapters": + if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + adapter_name = model.load_adapter(args.madx_lang_adapter) + model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xlsum-task-adapter") + model.set_active_adapters(adapter_name) + else: + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xlsum-task-adapter") #TODO: change the argument to this + model.set_active_adapters(adapter_name) + # print(model) + + + return model + + +if args.do_train: + logger.info("Starting training...") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics, + ) + + trainer.train() + + + +if args.do_predict: + if arg.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith("checkpoint-")], + key=lambda x: int(x[len('checkpoint-'):]))) + assert len(evaluation_dirs) > 0 + logger.info(f"Found {len(evaluation_dirs)} checkpoints") + + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = 
AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics, + ) + + print("Evaluating on test set...", trainer.evaluate()) diff --git a/scripts/eval_xnli/crosslingual_exp.sh b/scripts/eval_xnli/crosslingual_exp.sh new file mode 100644 index 0000000..8f1bafe --- /dev/null +++ b/scripts/eval_xnli/crosslingual_exp.sh @@ -0,0 +1,40 @@ +OUTPUT_DIR=./xlsum_ckpts # where to save checkpoints +LANG="thai" # language name, e.g. "thai" not "th" for xlsum. language code e.g. "de" for xnli. +TASK="xlsum" # xlsum or xnli +CACHE_DIR=~/.cache/huggingface/ # cache dir for saving/loading HF models and datasets +LR=1e-5 +MODEL_NAME="bigscience/tr5b-1B3-multilingual-alpha-checkpoints" +TOKENIZER_NAME="bigscience/tr5b-1B3-multilingual-alpha-checkpoints" +REVISION="global_step118500" # branch name, e.g. "global_step118500", if applicable + +DEEPSPEED_CONFIG="./deepspeed_config.json" # deepspeed config file, if using deepspeed +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME="" + +# only finetune task adapters +FT_STRATEGIES="task_adapters" + + +mkdir -p $OUTPUT_DIR +deepspeed --include localhost:0 adapters_xlsum_de.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--dataset $TASK \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 1 \ +--gradient_accumulation_steps 1 \ +--pretrained_model $MODEL_NAME \ +--tokenizer $TOKENIZER_NAME \ +--do_train \ +--do_eval_after_train \ +--use_partial_data \ +--zero_shot \ +--revision "$REVISION" \ +--adapter_lang_name "xlsum-de" \ +--finetune_strategies $FT_STRATEGIES \ +# --use_partial_data +# --deepspeed $DEEPSPEED_CONFIG + +# --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ \ No newline at end of file From fac77dc3d7ccbed44d0a6c041e758674e5777670 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Mon, 9 May 2022 15:12:28 -0400 Subject: [PATCH 011/142] xlsum separate script --- scripts/eval_xnli/adapters_xlsum_de.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/eval_xnli/adapters_xlsum_de.py b/scripts/eval_xnli/adapters_xlsum_de.py index e396964..6d434e3 100644 --- a/scripts/eval_xnli/adapters_xlsum_de.py +++ b/scripts/eval_xnli/adapters_xlsum_de.py @@ -15,7 +15,6 @@ logger.remove() logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - # parser parser = argparse.ArgumentParser() parser.add_argument("output_dir") From 96339f4811c53e628ae2865b092ef1b91fa25e94 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Mon, 9 May 2022 19:14:48 -0400 Subject: [PATCH 012/142] script bugfixes --- scripts/eval_xnli/adapters_eval.py | 52 ++++++++++++++++++------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/scripts/eval_xnli/adapters_eval.py b/scripts/eval_xnli/adapters_eval.py index 649ce1b..5379845 100644 --- a/scripts/eval_xnli/adapters_eval.py +++ b/scripts/eval_xnli/adapters_eval.py @@ -28,6 +28,7 @@ parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--per_device_eval_batch_size", type=int, default=1) parser.add_argument("--pretrained_model") parser.add_argument("--original_model") parser.add_argument("--tokenizer") @@ -68,16 +69,16 @@ # will need to rename splits if the dataset has different name for validation set if args.zero_shot: - print("Cross Lingual") + print("0️⃣ Cross 
Lingual") # cross lingual: use english as train and validation set - en_dataset = load_dataset(args.dataset, "english", cache_dir=args.cache_dir) + en_dataset = load_dataset(args.dataset, "english" if args.dataset == "xlsum" else "en", cache_dir=args.cache_dir) dataset = load_dataset(args.dataset, args.lang, cache_dir=args.cache_dir) train_dataset = en_dataset["train"] val_dataset = en_dataset["validation"] test_dataset = dataset["test"] else: - print("Supervised training") + print("👀 Supervised training") dataset = load_dataset(args.dataset, args.lang, cache_dir=args.cache_dir) train_dataset = dataset["train"] @@ -90,21 +91,29 @@ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision) tokenizer.pad_token = tokenizer.eos_token -def tokenize_function(example): - inputs = tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) +if args.dataset == "xnli": + def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - with tokenizer.as_target_tokenizer(): - summaries = tokenizer(f'{example["summary"]}', max_length=256, padding="max_length", truncation=True) - - inputs["labels"] = summaries["input_ids"] +elif args.dataset == "xlsum": + def tokenize_function(example): + inputs = tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) - return inputs + with tokenizer.as_target_tokenizer(): + summaries = tokenizer(f'{example["summary"]}', max_length=256, padding="max_length", truncation=True) + + inputs["labels"] = summaries["input_ids"] + + return inputs if args.zero_shot: en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision) en_tokenizer.pad_token = en_tokenizer.eos_token if args.dataset == "xnli": + def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + elif args.dataset == "xlsum": def en_tokenize_function(example): inputs = en_tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) @@ -164,6 +173,7 @@ def compute_metrics(eval_preds): eval_steps=500 if not args.use_partial_data else None, num_train_epochs=args.num_train_epochs, per_device_train_batch_size=args.per_device_train_batch_size, + per_device_eval_batch_size=args.per_device_eval_batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, evaluation_strategy="epoch", @@ -181,12 +191,14 @@ def load_model(args, inference=False): # Hack for loading wte module not needed here, since using a causal language model class if args.zero_shot and not inference: - model = model_class_mapping[args.task].from_pretrained(args.pretrained_model, + model = model_class_mapping[args.dataset].from_pretrained(args.pretrained_model, + num_labels=3 if args.dataset == "xnli" else None, pad_token_id=en_tokenizer.pad_token_id, cache_dir=args.cache_dir, revision=args.revision) else: - model = model_class_mapping[args.task].from_pretrained(args.pretrained_model, + model = model_class_mapping[args.dataset].from_pretrained(args.pretrained_model, + num_labels=3 if args.dataset == "xnli" else None, pad_token_id=tokenizer.pad_token_id, cache_dir=args.cache_dir, revision=args.revision) @@ -216,8 
+228,8 @@ def load_model(args, inference=False): elif args.finetune_strategies == "lang_adapters": model.train_adapter([args.adapter_lang_name]) elif args.finetune_strategies == "task_adapters": - model.add_adapter("xlsum-task-adapter") - model.train_adapter("xlsum-task-adapter") + model.add_adapter(f"{args.dataset}-task-adapter") + model.train_adapter(f"{args.dataset}-task-adapter") else: raise ValueError("invalid configuration") @@ -239,10 +251,10 @@ def load_model(args, inference=False): assert args.pretrained_adapters_dir adapter_name = model.load_adapter(args.madx_lang_adapter) model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xlsum-task-adapter") + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.dataset}-task-adapter") model.set_active_adapters(adapter_name) else: - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xlsum-task-adapter") #TODO: change the argument to this + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.dataset}-task-adapter") #TODO: change the argument to this model.set_active_adapters(adapter_name) # print(model) @@ -266,7 +278,7 @@ def load_model(args, inference=False): if args.do_predict: - if arg.do_eval_after_train: + if args.do_eval_after_train: evaluation_dirs = list(sorted([ checkpoint_dir for checkpoint_dir in os.listdir(args.output_dir) @@ -275,9 +287,9 @@ def load_model(args, inference=False): assert len(evaluation_dirs) > 0 logger.info(f"Found {len(evaluation_dirs)} checkpoints") - if args.madx_lang_adapter: - args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + # load the last checkpoint. 
TODO: make sure this still should be done even if no madx adapter is used + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") model = load_model(args, inference=True) training_args.report_to = list() From 043ece77752c82d0a5293ea44d238c162b430367 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 11 May 2022 00:31:40 -0400 Subject: [PATCH 013/142] change zero_shot to cross_lingual --- scripts/eval_xnli/adapters_xnli_de_vn.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index 45ae562..d634e5c 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -34,7 +34,7 @@ parser.add_argument("--do_eval_after_train", default=False, action="store_true") parser.add_argument("--do_predict", default=False, action="store_true") parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") +parser.add_argument("--cross_lingual", default=False, action="store_true") finetune_strategies = ["whole", "lang_adapters", "task_adapters"] parser.add_argument("--madx_lang_adapter") @@ -54,7 +54,7 @@ # load dataset -if args.zero_shot: +if args.cross_lingual: print("0️⃣ 0-Shot") # 0-shot: use english as train and validation xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) @@ -75,7 +75,7 @@ # load tokenizer tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: +if args.cross_lingual: en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer en_tokenizer.pad_token = en_tokenizer.eos_token @@ -88,7 +88,7 @@ def en_tokenize_function(examples): logger.info("Tokenizing the dataset...") if args.do_train: - if args.zero_shot: + if args.cross_lingual: full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) else: @@ -144,7 +144,7 @@ def load_model(args, inference=False): pad_token_id=en_tokenizer.pad_token_id, cache_dir=args.cache_dir) - if not args.zero_shot: + if not args.cross_lingual: causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) if not args.original_model == args.pretrained_model: @@ -156,7 +156,7 @@ def load_model(args, inference=False): model.set_active_adapters(adapter_name) if not inference: - #if not args.zero_shot: normally need to add adapter in any case + #if not args.cross_lingual: normally need to add adapter in any case # normally this is already done, why use adapter_lang_name here? 
#if args.madx_lang_adapter: # adapter_name = model.load_adapter(args.madx_lang_adapter, From 6ac743aefb1aa44ed7dbf78e1392808a75078293 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 11 May 2022 00:46:12 -0400 Subject: [PATCH 014/142] load language adapters during inference setting --- scripts/eval_xnli/adapters_xnli_de_vn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index d634e5c..b547509 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -144,7 +144,7 @@ def load_model(args, inference=False): pad_token_id=en_tokenizer.pad_token_id, cache_dir=args.cache_dir) - if not args.cross_lingual: + if inference or not args.cross_lingual: causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) if not args.original_model == args.pretrained_model: From d4b0e3039bc1cdbc280f7e815828b13739980735 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 15:19:56 +0200 Subject: [PATCH 015/142] updated tokenizer training script --- scripts/lang_adapt/tokenized4clm_sampled.py | 39 +++++++++++++++---- scripts/lang_adapt/train_tokenizer_scratch.sh | 17 ++++++++ scripts/lang_adapt/train_tokenizer_update.sh | 17 ++++++++ 3 files changed, 66 insertions(+), 7 deletions(-) create mode 100644 scripts/lang_adapt/train_tokenizer_scratch.sh create mode 100644 scripts/lang_adapt/train_tokenizer_update.sh diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index 672277a..775815e 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -30,6 +30,7 @@ parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) parser.add_argument('--vocab_size', default=130_000, type=int) parser.add_argument('--extend_vocab', action='store_true') +parser.add_argument('--replace_with_overlap', action='store_true') # this is not working as expected parser.add_argument('--sample_size', default=None, type=int) args = parser.parse_args() @@ -54,14 +55,38 @@ print(f"✅ Loaded raw_datasets OSCAR language {lang}") def batch_iterator(): + global unique_toks batch_size = 1000 for i in range(0, len(raw_datasets), batch_size): - yield raw_datasets[i : i + batch_size]["text"] + sample = raw_datasets[i : i + batch_size]["text"] + unique_toks = unique_toks.union(set(" ".join(sample).split(" "))) + yield sample -tokenizer = AutoTokenizer.from_pretrained("gpt2") -assert tokenizer.is_fast -new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) -print("✅ Trained tokenizer with len ", len(new_tokenizer)) +unique_toks = set() -new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}") -print(f"✅ Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}") +if args.extend_vocab: + tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') + assert tokenizer.is_fast + new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) + print("✅ Trained tokenizer with len ", len(new_tokenizer)) + added = tokenizer.add_tokens([tok for tok in new_tokenizer.vocab.keys()]) + print(f"Overlap with previous vocab: 
{args.vocab_size - added}") + tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") + print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") +elif args.replace_with_overlap: + # This setting is not really working properly: we need to save the new_tokenizer, but add somehow token that can be used at inference which I don't know how to do (so that it is also get used at tokenization step properly + tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') + + assert tokenizer.is_fast + new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) + print("✅ Trained tokenizer with len ", len(new_tokenizer)) + new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") + print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") +else: + tokenizer = AutoTokenizer.from_pretrained('gpt2') + assert tokenizer.is_fast + new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) + print("Unique toks, ", len(unique_toks)) + print("✅ Trained tokenizer with len ", len(new_tokenizer)) + new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_scratch") + print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_scratch") diff --git a/scripts/lang_adapt/train_tokenizer_scratch.sh b/scripts/lang_adapt/train_tokenizer_scratch.sh new file mode 100644 index 0000000..354efbb --- /dev/null +++ b/scripts/lang_adapt/train_tokenizer_scratch.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=cpu + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + + + +bs_dir=/tmp-network/user/vnikouli/Projects/bigscience +lng=$1 +sample_size=$2 +vocab_size=$3 +source $bs_dir/multilingual-modeling/scripts/env/bin/activate +python tokenized4clm_sampled.py --lang $lng --tokenizer_dir $bs_dir/tokenizers --hf_cache_dir $bs_dir/data --vocab_size $vocab_size --sample_size $sample_size + diff --git a/scripts/lang_adapt/train_tokenizer_update.sh b/scripts/lang_adapt/train_tokenizer_update.sh new file mode 100644 index 0000000..4c08242 --- /dev/null +++ b/scripts/lang_adapt/train_tokenizer_update.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=cpu + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + + + +bs_dir=/tmp-network/user/vnikouli/Projects/bigscience +lng=$1 +sample_size=$2 +vocab_size=$3 +source $bs_dir/multilingual-modeling/scripts/env/bin/activate +python tokenized4clm_sampled.py --lang $lng --tokenizer_dir $bs_dir/tokenizers --hf_cache_dir $bs_dir/data --vocab_size $vocab_size --sample_size $sample_size --extend_vocab + From f3a165e19760286ffba0e875a708b293c62b8a59 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 15:36:30 +0200 Subject: [PATCH 016/142] added xnli zero shot training and eval scripts --- scripts/eval_xnli/run_eval_xnli_zero_shot.sh | 67 ++++++++++++++++++++ scripts/eval_xnli/train_xnli_en.sh | 66 +++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 scripts/eval_xnli/run_eval_xnli_zero_shot.sh create mode 100644 
scripts/eval_xnli/train_xnli_en.sh diff --git a/scripts/eval_xnli/run_eval_xnli_zero_shot.sh b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh new file mode 100644 index 0000000..cfd8964 --- /dev/null +++ b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh @@ -0,0 +1,67 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gres="gpu:1" +#SBATCH --mem=100g + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com +#SBATCH --constraint="gpu_v100&gpu_32g" + +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience +# Set up the environment by loading modules +source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate + +# XNLI (Cross-Lingual and Supervised Setting) + +LANG=$1 +data_sample=$2 +vocabsize=$3 +adapter_reduction_factor=$4 + +ch=118500 + + +adapter_config="pfeiffer+inv" +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +ORIGINAL_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name +TOKENIZER_DIR="${FP_BIGS}/tokenizers/${LANG}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k +CACHE_DIR="${FP_BIGS}/data/" +data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${data_sample}-$( basename $TOKENIZER_DIR )" +data_tok_dir=${data_dir}/lng_tok + +MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +XNLI_ZH_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full +LR=1e-5 + +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME="$MODEL_DIR/oscar_${LANG}" + +# we finetune task adapters for XNLI +FT_STRATEGIES="task_adapters" + +outdir=$MODEL_DIR/xnli_eval_zero_shot +# evaluate zero-shot training +python adapters_xnli_de_vn.py \ +$XNLI_ZH_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_DIR \ +--original_model $ORIGINAL_MODEL \ +--tokenizer $TOKENIZER_DIR \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--finetune_strategies "task_adapters" \ +--zero_shot &> $XNLI_ZH_DIR/$( basename $data_dir )-$( basename $MODEL_DIR )_eval.log + + + + +#Remove `--zero_shot` for supervised finetuning setting. + +### Zero-shot Prompt-based Setting + +#See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). 
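+
+# Illustrative invocation (positional args: LANG DATA_SAMPLE VOCAB_SIZE ADAPTER_REDUCTION_FACTOR;
+# the values below are placeholders):
+#   sbatch train_xnli_en.sh de 100000 24000 16             # first: train the English XNLI task adapter
+#   sbatch run_eval_xnli_zero_shot.sh de 100000 24000 16   # then: zero-shot eval on the language-adapted model
+# Both scripts point at $ORIGINAL_MODEL/xnli_task_adapter_full, so the task adapter trained by
+# train_xnli_en.sh is the one picked up here through --do_eval_after_train.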
diff --git a/scripts/eval_xnli/train_xnli_en.sh b/scripts/eval_xnli/train_xnli_en.sh new file mode 100644 index 0000000..8a9445c --- /dev/null +++ b/scripts/eval_xnli/train_xnli_en.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH -p gpu +#SBATCH --gres="gpu:1" + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J run_clm_madx + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com +#SBATCH --constraint="gpu_v100&gpu_32g" + +# XNLI (Cross-Lingual and Supervised Setting) + +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience +# Set up the environment by loading modules +source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate + +LANG=$1 +data_sample=$2 +vocabsize=$3 +adapter_reduction_factor=$4 + +ch=118500 + + +adapter_config="pfeiffer+inv" +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +ORIGINAL_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name +TOKENIZER_DIR="${FP_BIGS}/tokenizers/${LANG}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k +CACHE_DIR="${FP_BIGS}/data/" +data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${data_sample}-$( basename $TOKENIZER_DIR )" +data_tok_dir=${data_dir}/lng_tok + +MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +OUTPUT_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full +LR=1e-5 + +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME="$MODEL_DIR/oscar_de" + +# we finetune task adapters for XNLI +FT_STRATEGIES="task_adapters" + +mkdir -p $OUTPUT_DIR +python adapters_xnli_de_vn.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_DIR \ +--original_model $ORIGINAL_MODEL \ +--tokenizer $TOKENIZER_DIR \ +--do_train \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--finetune_strategies "task_adapters" \ +--zero_shot &> $OUTPUT_DIR/train.log + From 5ed40b9f8663703161512a2cadce51dba7f1f986 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 15:37:08 +0200 Subject: [PATCH 017/142] added xnli zero shot training and eval scripts --- scripts/eval_xnli/adapters_xnli_de_vn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index 45ae562..317b847 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -135,7 +135,7 @@ def compute_metrics(eval_pred): def load_model(args, inference=False): # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one # even when we call load_adapter - if not args.original_model == args.pretrained_model: + if not args.original_model == args.pretrained_model and not args.zero_shot: wte = torch.load(f'{args.pretrained_model}/embedding.pt') wpe = torch.load(f'{args.pretrained_model}/positional_embedding.pt') From 2497ba3e5d78c7003fef867d2101bc9386c5eee4 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 15:41:10 +0200 Subject: [PATCH 018/142] merged with current version --- scripts/eval_xnli/adapters_xnli_de_vn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index f10e10a..880f3b5 100644 
--- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -135,7 +135,7 @@ def compute_metrics(eval_pred): def load_model(args, inference=False): # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one # even when we call load_adapter - if not args.original_model == args.pretrained_model and not args.zero_shot: + if not args.original_model == args.pretrained_model and not args.cross_lingual: wte = torch.load(f'{args.pretrained_model}/embedding.pt') wpe = torch.load(f'{args.pretrained_model}/positional_embedding.pt') @@ -145,6 +145,7 @@ def load_model(args, inference=False): cache_dir=args.cache_dir) if inference or not args.cross_lingual: + # need to load embedding/adapters from the model adapted to the new language causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) if not args.original_model == args.pretrained_model: From f35b984848f6360098a40d80fdd9fe6a6acc6f2a Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 16:49:24 +0200 Subject: [PATCH 019/142] added script to get stats about different tokenizers --- scripts/lang_adapt/compute_tok_overlap.py | 93 +++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 scripts/lang_adapt/compute_tok_overlap.py diff --git a/scripts/lang_adapt/compute_tok_overlap.py b/scripts/lang_adapt/compute_tok_overlap.py new file mode 100644 index 0000000..23bd70c --- /dev/null +++ b/scripts/lang_adapt/compute_tok_overlap.py @@ -0,0 +1,93 @@ +import sys +import json +import datasets +from datasets import load_dataset +from transformers import AutoTokenizer +import numpy as np +from collections import defaultdict +import math +import argparse +import matplotlib.pyplot as plt + +def get_en_tokenizer(): + en_tok = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') + return en_tok + +def getdata(lng): + flores_path="/tmp-network/user/vnikouli/Projects/NLE-NMT/data/test_sets/" + with open(f'{flores_path}/FLORES-valid.{lng}') as f: + dataset = f.readlines() + return dataset + +def gettokens(tok, dataset): + from collections import defaultdict + seq_lengths = [] + toks_occ = defaultdict(int) + for i,l in enumerate(dataset): + toks = tok.tokenize(l.strip()) + seq_lengths.append(len(toks)) + toks_occ.update({t:toks_occ[t]+1 for t in toks }) + return np.array(seq_lengths), toks_occ + + + +def plot_histogram(tokoccs, name, ax, nb_bins): + ax.hist(tokoccs, nb_bins, histtype='bar', label=name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--lang', type=str, required=True) + parser.add_argument('--tokenizers', type=str, nargs='+', + help='an integer for the accumulator') + parser.add_argument('--plot_name', type=str, default="stats_plot") + args = parser.parse_args() + lng = args.lang + tokenizers = args.tokenizers + vocabs = {} + dataset=getdata(lng) + en_dataset = getdata("en") + seq_lengths = {} + tok_occs = {} + en_tok = get_en_tokenizer() + sl, to = gettokens(en_tok, en_dataset) + seq_lengths['en'] = sl + + for t in tokenizers: + tok = AutoTokenizer.from_pretrained(t) + sl, to = gettokens(tok, dataset) + seq_lengths[t] = sl + tok_occs[t] = to + with open(f'{t}/vocab.json') as jsonFile: + vocab = json.load(jsonFile) + vocabs[t] = set(vocab.keys()) + + + print("Print tokenization stats") + 
print("===============================") + fig, ax = plt.subplots(1, 4, figsize=(40, 10)) + for t in tokenizers: + print(f'Tokenizer {t}, avg tokenized seq length: {np.mean(seq_lengths[t])} (shorter sequences are better)') + #we want to decompose sentence in {lng} in approximately the same nb of tokens as in English hoping that it will favour knowledge transfer + x = seq_lengths[t]/seq_lengths["en"] + print(f'Tokenizer {t}, avg ratio with En tokenized sentence length: {np.mean(x)}+/- {np.std(x)}') + baseline_overlap = vocabs[t].intersection(set(en_tok.vocab.keys())) + print(f"Overlap with original tokenizer vocab : {len(baseline_overlap)} ") + print(f"Overlap between new tokenizer vocab and obtained tokenswith original tokenizer vocab : {len(baseline_overlap)} ") + + + + print("Do plotting") + fig, ax = plt.subplots(1, 4, figsize=(40, 10)) + ax[0].set_title("Token occ distribution") + plot_histogram([[math.log(v) for v in tok_occs[t].values()] for t in tokenizers], tokenizers, ax[0], 10) + ax[1].set_title("Seq length distribution") + plot_histogram([seq_lengths[t] for t in tokenizers], tokenizers, ax[1], 10) + ax[2].set_title("Diff wtih en seq length distribution") + plot_histogram([seq_lengths[t]/seq_lengths["en"] for t in tokenizers], tokenizers, ax[2], 10) + ax[3].set_title("Tok length distribution") + plot_histogram([[len(v) for v in vocabs[t] for i in range(tok_occs[t][v])] for t in tokenizers], tokenizers, ax[3], 10) + ax[1].legend() + fig.savefig(f"{args.plot_name}.png") + + From dbf3f0e6e71acf104bf74741db4ce3a303ded433 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 16:53:07 +0200 Subject: [PATCH 020/142] added script to get stats about different tokenizers --- scripts/lang_adapt/compute_tok_overlap.py | 30 +++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/lang_adapt/compute_tok_overlap.py b/scripts/lang_adapt/compute_tok_overlap.py index 23bd70c..533d1bb 100644 --- a/scripts/lang_adapt/compute_tok_overlap.py +++ b/scripts/lang_adapt/compute_tok_overlap.py @@ -39,8 +39,8 @@ def plot_histogram(tokoccs, name, ax, nb_bins): parser = argparse.ArgumentParser() parser.add_argument('--lang', type=str, required=True) parser.add_argument('--tokenizers', type=str, nargs='+', - help='an integer for the accumulator') - parser.add_argument('--plot_name', type=str, default="stats_plot") + help='list of the tokenizers for which you want to get statstics') + parser.add_argument('--plot_name', type=str, default=None, help="If set generate plots containing tokens distribution across different axes (frequency, length, etc)") args = parser.parse_args() lng = args.lang tokenizers = args.tokenizers @@ -76,18 +76,18 @@ def plot_histogram(tokoccs, name, ax, nb_bins): print(f"Overlap between new tokenizer vocab and obtained tokenswith original tokenizer vocab : {len(baseline_overlap)} ") - - print("Do plotting") - fig, ax = plt.subplots(1, 4, figsize=(40, 10)) - ax[0].set_title("Token occ distribution") - plot_histogram([[math.log(v) for v in tok_occs[t].values()] for t in tokenizers], tokenizers, ax[0], 10) - ax[1].set_title("Seq length distribution") - plot_histogram([seq_lengths[t] for t in tokenizers], tokenizers, ax[1], 10) - ax[2].set_title("Diff wtih en seq length distribution") - plot_histogram([seq_lengths[t]/seq_lengths["en"] for t in tokenizers], tokenizers, ax[2], 10) - ax[3].set_title("Tok length distribution") - plot_histogram([[len(v) for v in vocabs[t] for i in range(tok_occs[t][v])] for t in tokenizers], tokenizers, 
ax[3], 10) - ax[1].legend() - fig.savefig(f"{args.plot_name}.png") + if args.plot_name: + print("Do plotting") + fig, ax = plt.subplots(1, 4, figsize=(40, 10)) + ax[0].set_title("Token occ distribution") + plot_histogram([[math.log(v) for v in tok_occs[t].values()] for t in tokenizers], tokenizers, ax[0], 10) + ax[1].set_title("Seq length distribution") + plot_histogram([seq_lengths[t] for t in tokenizers], tokenizers, ax[1], 10) + ax[2].set_title("Diff wtih en seq length distribution") + plot_histogram([seq_lengths[t]/seq_lengths["en"] for t in tokenizers], tokenizers, ax[2], 10) + ax[3].set_title("Tok length distribution") + plot_histogram([[len(v) for v in vocabs[t] for i in range(tok_occs[t][v])] for t in tokenizers], tokenizers, ax[3], 10) + ax[1].legend() + fig.savefig(f"{args.plot_name}.png") From 639c4da99cf22284484937381afc9d63bfc2d9b2 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 16:58:12 +0200 Subject: [PATCH 021/142] added script to get stats about different tokenizers --- scripts/lang_adapt/compute_tok_overlap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/lang_adapt/compute_tok_overlap.py b/scripts/lang_adapt/compute_tok_overlap.py index 533d1bb..a31e504 100644 --- a/scripts/lang_adapt/compute_tok_overlap.py +++ b/scripts/lang_adapt/compute_tok_overlap.py @@ -73,7 +73,8 @@ def plot_histogram(tokoccs, name, ax, nb_bins): print(f'Tokenizer {t}, avg ratio with En tokenized sentence length: {np.mean(x)}+/- {np.std(x)}') baseline_overlap = vocabs[t].intersection(set(en_tok.vocab.keys())) print(f"Overlap with original tokenizer vocab : {len(baseline_overlap)} ") - print(f"Overlap between new tokenizer vocab and obtained tokenswith original tokenizer vocab : {len(baseline_overlap)} ") + overlap_vocab_toks = vocabs[t].intersection(set(tok_occs[t].keys())) + print(f"Which portion of new tokenizer was used? : {len(overlap_vocab_toks)}, represents {1.0*len(overlap_vocab_toks}/len(vocabs[t])}% of learnt vocab ") if args.plot_name: From 685f402de159a4a1216f798b40d69930b9523788 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 17:03:27 +0200 Subject: [PATCH 022/142] added script to get stats about different tokenizers --- scripts/lang_adapt/compute_tok_overlap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/compute_tok_overlap.py b/scripts/lang_adapt/compute_tok_overlap.py index a31e504..8f95394 100644 --- a/scripts/lang_adapt/compute_tok_overlap.py +++ b/scripts/lang_adapt/compute_tok_overlap.py @@ -74,7 +74,7 @@ def plot_histogram(tokoccs, name, ax, nb_bins): baseline_overlap = vocabs[t].intersection(set(en_tok.vocab.keys())) print(f"Overlap with original tokenizer vocab : {len(baseline_overlap)} ") overlap_vocab_toks = vocabs[t].intersection(set(tok_occs[t].keys())) - print(f"Which portion of new tokenizer was used? : {len(overlap_vocab_toks)}, represents {1.0*len(overlap_vocab_toks}/len(vocabs[t])}% of learnt vocab ") + print(f"Which portion of new tokenizer was used? 
: {len(overlap_vocab_toks)}, represents {100.0*len(overlap_vocab_toks)/len(vocabs[t])}% of learnt vocab ") if args.plot_name: From 04f9fabf55ce82ee90c36643026da9609e0eb43f Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 17:41:45 +0200 Subject: [PATCH 023/142] fixed tokenizer training with unk token --- scripts/lang_adapt/tokenized4clm_sampled.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index 775815e..71e1fea 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -75,8 +75,8 @@ def batch_iterator(): print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") elif args.replace_with_overlap: # This setting is not really working properly: we need to save the new_tokenizer, but add somehow token that can be used at inference which I don't know how to do (so that it is also get used at tokenization step properly - tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') - + tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/', unk_token="") + assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) print("✅ Trained tokenizer with len ", len(new_tokenizer)) From 04f893f4abb365c0bceab7ab261a951c4db9acc2 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Thu, 12 May 2022 09:09:34 -0400 Subject: [PATCH 024/142] add num_classes arg to model init --- scripts/eval_xnli/adapters_eval.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/eval_xnli/adapters_eval.py b/scripts/eval_xnli/adapters_eval.py index 5379845..5334cb1 100644 --- a/scripts/eval_xnli/adapters_eval.py +++ b/scripts/eval_xnli/adapters_eval.py @@ -190,18 +190,22 @@ def compute_metrics(eval_preds): def load_model(args, inference=False): # Hack for loading wte module not needed here, since using a causal language model class + optional_kwargs = {} + if args.dataset == "xnli": + optional_kwargs = {"num_labels": 3} if args.zero_shot and not inference: + # only pass in num_labels if using a seq. classification model model = model_class_mapping[args.dataset].from_pretrained(args.pretrained_model, - num_labels=3 if args.dataset == "xnli" else None, pad_token_id=en_tokenizer.pad_token_id, cache_dir=args.cache_dir, - revision=args.revision) + revision=args.revision, + **optional_kwargs) else: model = model_class_mapping[args.dataset].from_pretrained(args.pretrained_model, - num_labels=3 if args.dataset == "xnli" else None, pad_token_id=tokenizer.pad_token_id, cache_dir=args.cache_dir, - revision=args.revision) + revision=args.revision, + **optional_kwargs) if not args.zero_shot or (args.zero_shot and inference): # if not zero shot, that means that we need to replace the embedding layers during training # we also need to replace embedding layers during inference @@ -287,7 +291,7 @@ def load_model(args, inference=False): assert len(evaluation_dirs) > 0 logger.info(f"Found {len(evaluation_dirs)} checkpoints") - # load the last checkpoint. TODO: make sure this still should be done even if no madx adapter is used + # load the last checkpoint. 
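# (Editor's aside, not part of this patch.) Taking evaluation_dirs[-1] assumes the
# listing is already ordered by training step; if evaluation_dirs comes from a plain
# os.listdir()/glob, "checkpoint-9" sorts after "checkpoint-24544" lexically. A
# hypothetical numeric sort over "checkpoint-<step>" names would be, for example:
#   evaluation_dirs = sorted(evaluation_dirs, key=lambda d: int(d.rsplit("-", 1)[-1]))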
args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") From 448035efc52d1c0fcc051a9c431631d563f09796 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 12 May 2022 21:24:38 -0400 Subject: [PATCH 025/142] rename pretrained_model to adapted_model --- scripts/eval_xnli/adapters_xnli_de_vn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index 880f3b5..3e29ddd 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -27,7 +27,7 @@ parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") +parser.add_argument("--adapted_model") parser.add_argument("--original_model") parser.add_argument("--tokenizer") parser.add_argument("--do_train", default=False, action="store_true") @@ -46,8 +46,8 @@ args.do_predict = True if args.original_model is None: - # here: because the wpe is not saved, pretrained_model is the original bigsciece model - args.original_model = args.pretrained_model + # here: because the wpe is not saved, adapted_model is the original bigsciece model + args.original_model = args.adapted_model print("Arguments: ========") print(args) @@ -135,9 +135,9 @@ def compute_metrics(eval_pred): def load_model(args, inference=False): # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one # even when we call load_adapter - if not args.original_model == args.pretrained_model and not args.cross_lingual: - wte = torch.load(f'{args.pretrained_model}/embedding.pt') - wpe = torch.load(f'{args.pretrained_model}/positional_embedding.pt') + if not args.original_model == args.adapted_model and not args.cross_lingual: + wte = torch.load(f'{args.adapted_model}/embedding.pt') + wpe = torch.load(f'{args.adapted_model}/positional_embedding.pt') model = GPT2ForSequenceClassification.from_pretrained(args.original_model, num_labels=3, @@ -148,7 +148,7 @@ def load_model(args, inference=False): # need to load embedding/adapters from the model adapted to the new language causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) - if not args.original_model == args.pretrained_model: + if not args.original_model == args.adapted_model: causal_lm_model.transformer.wte = wte causal_lm_model.transformer.wpe = wpe if args.madx_lang_adapter: From 7a8f899fee7fa8a265ac8193c1de0236b7f989b8 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 12 May 2022 21:28:24 -0400 Subject: [PATCH 026/142] use updated eval_xnli/adapters_xnli_de_vn.py --- scripts/eval_xnli/adapters_xnli_de.py | 137 ++++++++++++-------------- 1 file changed, 61 insertions(+), 76 deletions(-) diff --git a/scripts/eval_xnli/adapters_xnli_de.py b/scripts/eval_xnli/adapters_xnli_de.py index 46140aa..3e29ddd 100644 --- a/scripts/eval_xnli/adapters_xnli_de.py +++ b/scripts/eval_xnli/adapters_xnli_de.py @@ -27,18 +27,18 @@ parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") +parser.add_argument("--adapted_model") 
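# (Editor's gloss, not part of this patch.) As load_model below uses them,
# --original_model is the unmodified BigScience checkpoint that supplies the
# transformer body, while --adapted_model points to the language-adapted run whose
# saved embedding.pt / positional_embedding.pt (and adapters) are loaded on top.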
parser.add_argument("--original_model") parser.add_argument("--tokenizer") parser.add_argument("--do_train", default=False, action="store_true") parser.add_argument("--do_eval_after_train", default=False, action="store_true") parser.add_argument("--do_predict", default=False, action="store_true") parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") +parser.add_argument("--cross_lingual", default=False, action="store_true") finetune_strategies = ["whole", "lang_adapters", "task_adapters"] parser.add_argument("--madx_lang_adapter") -parser.add_argument("--adapter_lang_name", required=True) +#parser.add_argument("--adapter_lang_name", required=True) -- why is this required?? parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) args = parser.parse_args() @@ -46,21 +46,20 @@ args.do_predict = True if args.original_model is None: - # here: because the wpe is not saved, pretrained_model is the original bigsciece model - args.original_model = args.pretrained_model + # here: because the wpe is not saved, adapted_model is the original bigsciece model + args.original_model = args.adapted_model print("Arguments: ========") print(args) # load dataset -if args.zero_shot: +if args.cross_lingual: print("0️⃣ 0-Shot") # 0-shot: use english as train and validation xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) assert args.lang != "en" - train_dataset = xnli_en_dataset['train'] val_dataset = xnli_en_dataset['validation'] test_dataset = xnli_dataset['test'] @@ -76,7 +75,7 @@ # load tokenizer tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: +if args.cross_lingual: en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer en_tokenizer.pad_token = en_tokenizer.eos_token @@ -88,21 +87,23 @@ def en_tokenize_function(examples): logger.info("Tokenizing the dataset...") -if args.zero_shot: - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) -else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False) - full_val_dataset = val_dataset.map(tokenize_function, batched=False) +if args.do_train: + if args.cross_lingual: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) + else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + + + small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) + small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) + logger.info(full_train_dataset[0]) + logger.info(full_train_dataset[100]) full_test_dataset = test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - from datasets import load_metric metric 
= load_metric("xnli") @@ -132,51 +133,40 @@ def compute_metrics(eval_pred): ) def load_model(args, inference=False): - # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one # even when we call load_adapter - if args.zero_shot and not inference: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=en_tokenizer.pad_token_id, - cache_dir=args.cache_dir) - else: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=tokenizer.pad_token_id, - cache_dir=args.cache_dir) - - if not args.zero_shot or (args.zero_shot and inference): - # if not zero shot, that means that we need to replace the embedding layers during training - # we also need to replace embedding layers during inference - causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + if not args.original_model == args.adapted_model and not args.cross_lingual: + wte = torch.load(f'{args.adapted_model}/embedding.pt') + wpe = torch.load(f'{args.adapted_model}/positional_embedding.pt') + + model = GPT2ForSequenceClassification.from_pretrained(args.original_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) - # change the embedding layer of the original big science model - # by loading the adapters (which has saved lm_head) + if inference or not args.cross_lingual: + # need to load embedding/adapters from the model adapted to the new language + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) + if not args.original_model == args.adapted_model: + causal_lm_model.transformer.wte = wte + causal_lm_model.transformer.wpe = wpe if args.madx_lang_adapter: - causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") - - # model has original bigscience embedding so replace it. - model.resize_token_embeddings(len(tokenizer)) - model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] + adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + model.transformer = causal_lm_model.transformer + model.set_active_adapters(adapter_name) if not inference: - if not args.zero_shot: - if args.madx_lang_adapter: - adapter_name = model.load_adapter(args.madx_lang_adapter, - config="pfeiffer+inv", - load_as=args.adapter_lang_name) - if args.finetune_strategies == "whole": - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "lang_adapters": - model.train_adapter([args.adapter_lang_name]) - elif args.finetune_strategies == "task_adapters": - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - else: - raise ValueError("Lack configuration") - + #if not args.cross_lingual: normally need to add adapter in any case + # normally this is already done, why use adapter_lang_name here? 
+ #if args.madx_lang_adapter: + # adapter_name = model.load_adapter(args.madx_lang_adapter, + # config="pfeiffer+inv", + # load_as=args.adapter_lang_name) + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + + print("🔥 ==================== Training: ==================== 🔥") for name, param in model.named_parameters(): if not param.requires_grad: @@ -185,24 +175,19 @@ def load_model(args, inference=False): print(f"🚀 Trainable layer '{name}'") print(model) else: - print("🔥 ==================== Inference: ==================== 🔥") - if args.finetune_strategies == "lang_adapters": - assert args.pretrained_adapters_dir - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "task_adapters": - if args.madx_lang_adapter: - assert args.pretrained_adapters_dir - adapter_name = model.load_adapter(args.madx_lang_adapter) - model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") - model.set_active_adapters(adapter_name) - else: - # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") - - # for TGT -> TGT supervised finetuning setting, change adapter_name - adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") - model.set_active_adapters(adapter_name) + #if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + # normally this is done in any case + #adapter_name = model.load_adapter(args.madx_lang_adapter) + #model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + #else: + # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + # # not sure what happens here + # # for TGT -> TGT supervised finetuning setting, change adapter_name + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + # model.set_active_adapters(adapter_name) print(model) return model @@ -241,4 +226,4 @@ def load_model(args, inference=False): compute_metrics=compute_metrics ) - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file + print("Evaluate on Test:", trainer.evaluate()) From ac86e1c075779a3bb13e42fb54c58e239063999f Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 12 May 2022 21:34:47 -0400 Subject: [PATCH 027/142] update XNLI --- scripts/eval_xnli/README.md | 5 ++++- scripts/eval_xnli/run_eval_xnli_zero_shot.sh | 2 +- .../eval_xnli/{train_xnli_en.sh => train_xnli_zero_shot.sh} | 0 3 files changed, 5 insertions(+), 2 deletions(-) rename scripts/eval_xnli/{train_xnli_en.sh => train_xnli_zero_shot.sh} (100%) diff --git a/scripts/eval_xnli/README.md b/scripts/eval_xnli/README.md index 17fc051..f7c1195 100644 --- a/scripts/eval_xnli/README.md +++ b/scripts/eval_xnli/README.md @@ -30,13 +30,16 @@ $OUTPUT_DIR \ --do_train \ --do_eval_after_train \ --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ ---adapter_lang_name "xnli-de" \ --finetune_strategies $FT_STRATEGIES \ --zero_shot ``` Remove `--zero_shot` for supervised finetuning setting. 
+Notes: +- `adapters_xnli_de_vn.py` is Vassilina's forked of `adapters_xnli_de.py`. +- `train_xnli_zero_shot.sh` is the batch script for XNLI training, and `run_eval_xnli_zero_shot.sh` is for evaluating trained XNLI task adapters. + ### Zero-shot Prompt-based Setting See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). \ No newline at end of file diff --git a/scripts/eval_xnli/run_eval_xnli_zero_shot.sh b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh index cfd8964..855cde9 100644 --- a/scripts/eval_xnli/run_eval_xnli_zero_shot.sh +++ b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh @@ -30,7 +30,7 @@ data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${da data_tok_dir=${data_dir}/lng_tok MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" -XNLI_ZH_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full +XNLI_ZH_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full # output directory LR=1e-5 # language adapters checkpoint folder diff --git a/scripts/eval_xnli/train_xnli_en.sh b/scripts/eval_xnli/train_xnli_zero_shot.sh similarity index 100% rename from scripts/eval_xnli/train_xnli_en.sh rename to scripts/eval_xnli/train_xnli_zero_shot.sh From b31f805f4ef6543e59d223dd6692885469cb8750 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Fri, 13 May 2022 01:48:22 -0400 Subject: [PATCH 028/142] add seq2seq training and fix compute_metrics --- scripts/eval_xnli/adapters_eval.py | 77 ++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/scripts/eval_xnli/adapters_eval.py b/scripts/eval_xnli/adapters_eval.py index 5334cb1..5284539 100644 --- a/scripts/eval_xnli/adapters_eval.py +++ b/scripts/eval_xnli/adapters_eval.py @@ -8,8 +8,10 @@ import torch import numpy as np -from transformers import TrainingArguments, AdapterTrainer +import nltk +from transformers import TrainingArguments, AdapterTrainer, Seq2SeqAdapterTrainer, Seq2SeqTrainingArguments from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2ForSequenceClassification, AutoModelForCausalLM +from transformers import DataCollatorForSeq2Seq logger.remove() @@ -47,13 +49,25 @@ parser.add_argument("--deepspeed", required=False) -# mapping of task to model_class +# mapping of tasks to model/trainer classes model_class_mapping = {"xnli": GPT2ForSequenceClassification, "xlsum": GPT2LMHeadModel} +trainer_class_mapping = {"xnli": AdapterTrainer, "xlsum": Seq2SeqAdapterTrainer} +trainer_args_mapping = {"xnli": TrainingArguments, "xlsum": Seq2SeqTrainingArguments} + args = parser.parse_args() if args.do_eval_after_train: args.do_predict = True +# additional args to pass to the model init. 
task-dependent +optional_model_kwargs = {} +optional_trainer_args = {} +if args.dataset == "xnli": + optional_model_kwargs = {"num_labels": 3} +elif args.dataset == "xlsum": + optional_trainer_args = {"generation_max_length": 128, "predict_with_generate":True} + + if args.local_rank: torch.cuda.set_device(args.local_rank) @@ -97,10 +111,10 @@ def tokenize_function(examples): elif args.dataset == "xlsum": def tokenize_function(example): - inputs = tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) + inputs = tokenizer(f'summarize this article: {example["text"]}', max_length=96, padding="max_length", truncation=True) with tokenizer.as_target_tokenizer(): - summaries = tokenizer(f'{example["summary"]}', max_length=256, padding="max_length", truncation=True) + summaries = tokenizer(f'{example["summary"]}', max_length=96, padding="max_length", truncation=True) inputs["labels"] = summaries["input_ids"] @@ -116,10 +130,10 @@ def en_tokenize_function(examples): elif args.dataset == "xlsum": def en_tokenize_function(example): - inputs = en_tokenizer(f'summarize this article: {example["text"]}', max_length=256, padding="max_length", truncation=True) + inputs = en_tokenizer(f'summarize this article: {example["text"]}', max_length=96, padding="max_length", truncation=True) with en_tokenizer.as_target_tokenizer(): - summaries = en_tokenizer(f'{example["summary"]}', max_length=256, padding="max_length", truncation=True) + summaries = en_tokenizer(f'{example["summary"]}', max_length=96, padding="max_length", truncation=True) inputs["labels"] = summaries["input_ids"] @@ -158,14 +172,25 @@ def compute_metrics(eval_pred): def compute_metrics(eval_preds): preds, labels = eval_preds + + preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - return metric(preds, labels) + preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in preds] + labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels] + + result = metric.compute(predictions=preds, references=labels) + # TODO: need to confirm these are the right rouge values to report. Can report more ROUGE metrics if needed. + result = {key: value.mid.fmeasure * 100 for key, value in result.items()} + + return {k: round(v, 4) for k, v in result.items()} else: raise ValueError("Unknown dataset provided") -training_args = TrainingArguments( +training_args = trainer_args_mapping[args.dataset]( output_dir=args.output_dir, overwrite_output_dir=True, do_train=True, @@ -184,28 +209,26 @@ def compute_metrics(eval_preds): logging_dir=f"{args.output_dir}/logs", load_best_model_at_end=True, deepspeed=args.deepspeed, + **optional_trainer_args, ) # TODO: double-check the adapter loading logic here def load_model(args, inference=False): # Hack for loading wte module not needed here, since using a causal language model class - optional_kwargs = {} - if args.dataset == "xnli": - optional_kwargs = {"num_labels": 3} if args.zero_shot and not inference: # only pass in num_labels if using a seq. 
classification model model = model_class_mapping[args.dataset].from_pretrained(args.pretrained_model, pad_token_id=en_tokenizer.pad_token_id, cache_dir=args.cache_dir, revision=args.revision, - **optional_kwargs) + **optional_model_kwargs) else: model = model_class_mapping[args.dataset].from_pretrained(args.pretrained_model, pad_token_id=tokenizer.pad_token_id, cache_dir=args.cache_dir, revision=args.revision, - **optional_kwargs) + **optional_model_kwargs) if not args.zero_shot or (args.zero_shot and inference): # if not zero shot, that means that we need to replace the embedding layers during training # we also need to replace embedding layers during inference @@ -269,12 +292,25 @@ def load_model(args, inference=False): if args.do_train: logger.info("Starting training...") model = load_model(args) - trainer = AdapterTrainer( + + + # only use seq2seq collator if doing seq2seq task + if args.dataset == "xlsum": + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=-100, + ) + + + trainer = trainer_class_mapping[args.dataset]( model=model, args=training_args, train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == "xlsum" else {}, ) trainer.train() @@ -298,11 +334,22 @@ def load_model(args, inference=False): model = load_model(args, inference=True) training_args.report_to = list() - trainer = AdapterTrainer( + if args.dataset == "xlsum": + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=-100, + pad_to_multiple_of=8 if training_args.fp16 else None, + ) + + trainer = trainer_class_mapping[args.dataset]( model=model, args=training_args, eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == "xlsum" else {} + ) print("Evaluating on test set...", trainer.evaluate()) From 3b5922275038b024bb045a011f8dbf21648d63b6 Mon Sep 17 00:00:00 2001 From: Zheng Xin Yong Date: Mon, 13 Sep 2021 18:08:28 -0400 Subject: [PATCH 029/142] exp-001: finetune gpt-2 model with new tokenizer on fr --- experiments/README.md | 2 +- experiments/exp-001/download_oscar_fr.py | 9 +++++++ experiments/exp-001/download_oscar_fr.sh | 30 +++++++++++++++++++++ experiments/exp-001/train_tokenizer_gpt2.py | 20 ++++++++++++++ 4 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 experiments/exp-001/download_oscar_fr.py create mode 100644 experiments/exp-001/download_oscar_fr.sh create mode 100644 experiments/exp-001/train_tokenizer_gpt2.py diff --git a/experiments/README.md b/experiments/README.md index daeb6d8..513efc3 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -1,5 +1,5 @@ # Current Experiments - +- `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_fr`. 
# Carbon Tracking Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) diff --git a/experiments/exp-001/download_oscar_fr.py b/experiments/exp-001/download_oscar_fr.py new file mode 100644 index 0000000..fd04779 --- /dev/null +++ b/experiments/exp-001/download_oscar_fr.py @@ -0,0 +1,9 @@ +from datasets import load_dataset +from dotenv import load_dotenv +import os +from pathlib import Path + +load_dotenv(str(Path.home() / ".env")) + +dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_fr") +print("Done") \ No newline at end of file diff --git a/experiments/exp-001/download_oscar_fr.sh b/experiments/exp-001/download_oscar_fr.sh new file mode 100644 index 0000000..a558c9e --- /dev/null +++ b/experiments/exp-001/download_oscar_fr.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=3-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-031-download_oscar_fr + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_fr.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_fr.err + +# Set up the environment by loading modules +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +python3 $FP_BIGS/scripts/exp-001/download_oscar_fr.py \ No newline at end of file diff --git a/experiments/exp-001/train_tokenizer_gpt2.py b/experiments/exp-001/train_tokenizer_gpt2.py new file mode 100644 index 0000000..d8fe237 --- /dev/null +++ b/experiments/exp-001/train_tokenizer_gpt2.py @@ -0,0 +1,20 @@ +from datasets import load_dataset +from dotenv import load_dotenv +import os +from pathlib import Path + +load_dotenv(str(Path.home() / ".env")) + +dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_fr") + +def batch_iterator(): + batch_size = 1000 + for i in range(0, len(dataset), batch_size): + yield dataset['train'][i : i + batch_size]["text"] + +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("gpt2") +assert tokenizer.is_fast +new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50_257) +new_tokenizer.save_pretrained(f"{os.getenv('FP_BIGS')}/data/processed/exp-001/oscar-fr-tokenizer") \ No newline at end of file From 33352099cf82c0ffb2d2a171ef7777e329669ed3 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 13 Sep 2021 18:37:37 -0400 Subject: [PATCH 030/142] add run_clm.py --- experiments/exp-001/README.md | 22 ++ experiments/exp-001/run_clm.py | 544 +++++++++++++++++++++++++++++++++ experiments/exp-001/run_clm.sh | 41 +++ 3 files changed, 607 insertions(+) create mode 100644 experiments/exp-001/README.md create mode 100644 experiments/exp-001/run_clm.py create mode 100644 experiments/exp-001/run_clm.sh diff --git a/experiments/exp-001/README.md b/experiments/exp-001/README.md new file mode 100644 index 0000000..9d89cb5 --- /dev/null +++ b/experiments/exp-001/README.md @@ -0,0 +1,22 @@ +# Decisions + +**Dataset**: HF's OSCAR unshuffled_deduplicated_fr + +**Tokenizer**: byte-level Byte-pair encoding 
tokenizer (same as GPT-2). Training is identical to the section "Using an existing tokenizer" in huggingface's tokenizer_training [tutorial](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) +- train the GPT-2 tokenizer with the exact same algorithms and parameters as an existing one. +- vocab_size: 50,257 (same as original GPT-2) + +**Model Finetuning**: Use [HF's code](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling) `run_clm.py` to finetune GPT-2 with the new tokenizer on oscar's fr dataset. + +Freeze all the layers of GPT-2 except the word embedding layer `wte` and the positional embedding layer `wpe` by adding the following snippet of codes to `run_clm.py`. +``` +... +for name, param in model.named_parameters(): + if name not in ('transformer.wte.weight', 'transformer.wpe.weight'): + print(f"🥶 Freeze layer '{name}'") + param.requires_grad = False + else: + param.requires_grad = True +... +``` + diff --git a/experiments/exp-001/run_clm.py b/experiments/exp-001/run_clm.py new file mode 100644 index 0000000..209c66a --- /dev/null +++ b/experiments/exp-001/run_clm.py @@ -0,0 +1,544 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=causal-lm +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. + +import torch +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 
+ """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." 
+ }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. 
+ set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + ) + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + model.resize_token_embeddings(len(tokenizer)) + for name, param in model.named_parameters(): + if name not in ('transformer.wte.weight', 'transformer.wpe.weight'): + print(f"🥶 Freeze layer '{name}'") + param.requires_grad = False + else: + param.requires_grad = True + + # Preprocessing the datasets. + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 
+ ) + return output + + with training_args.main_process_first(desc="dataset map tokenization"): + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.output_dir}/tokenized_datasets.pt") + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + print("Sanity check: loaded tokenized_datasets") + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + print("Sanity check: saved tokenized_datasets") + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.output_dir}/lm_datasets.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + print("Sanity check: loaded lm_datasets") + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + print("Sanity check: saved lm_datasets") + + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. + data_collator=default_data_collator, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub: + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + trainer.push_to_hub(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ 
== "__main__": + main() diff --git a/experiments/exp-001/run_clm.sh b/experiments/exp-001/run_clm.sh new file mode 100644 index 0000000..f066b58 --- /dev/null +++ b/experiments/exp-001/run_clm.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=3-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-001-run_clm + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer" +cache_dir="${FP_BIGS}/data/external/oscar_fr" +output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2" + +python $FP_BIGS/scripts/exp-001/run_clm.py \ + --model_name_or_path gpt2 \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name unshuffled_deduplicated_fr \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ No newline at end of file From b58c999a2baa61ecde6b01565bbfc129794d1450 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 15 Sep 2021 15:50:29 -0400 Subject: [PATCH 031/142] update run_clm and run_clm_no_tok --- experiments/exp-001/run_clm.sh | 8 +++-- experiments/exp-001/run_clm_no_tok.sh | 42 +++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 experiments/exp-001/run_clm_no_tok.sh diff --git a/experiments/exp-001/run_clm.sh b/experiments/exp-001/run_clm.sh index f066b58..35bd65a 100644 --- a/experiments/exp-001/run_clm.sh +++ b/experiments/exp-001/run_clm.sh @@ -1,7 +1,7 @@ #!/bin/bash # Request half an hour of runtime: -#SBATCH --time=3-23:59:00 +#SBATCH --time=5-23:59:00 # Ask for the GPU partition and 1 GPU #SBATCH --partition=3090-gcondo --gres=gpu:1 @@ -9,7 +9,7 @@ # Default resources are 1 core with 2.8GB of memory. # Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g +#SBATCH --mem=150g # Specify a job name: #SBATCH -J exp-001-run_clm @@ -38,4 +38,6 @@ python $FP_BIGS/scripts/exp-001/run_clm.py \ --dataset_config_name unshuffled_deduplicated_fr \ --do_train \ --do_eval \ - --output_dir $output_dir \ No newline at end of file + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ No newline at end of file diff --git a/experiments/exp-001/run_clm_no_tok.sh b/experiments/exp-001/run_clm_no_tok.sh new file mode 100644 index 0000000..10aafa7 --- /dev/null +++ b/experiments/exp-001/run_clm_no_tok.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=5-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=150g + +# Specify a job name: +#SBATCH -J exp-001-run_clm_no_tok + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_no_tok.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_no_tok.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer" +cache_dir="${FP_BIGS}/data/external/oscar_fr" +output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-no-tok" + +python $FP_BIGS/scripts/exp-001/run_clm.py \ + --model_name_or_path gpt2 \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name unshuffled_deduplicated_fr \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ No newline at end of file From 2dd82adda4b7be382bb9b0c747a7a66142768288 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 16 Sep 2021 11:59:10 -0400 Subject: [PATCH 032/142] reduce per_device_{train, eval}_batch_size and increase {gradient, eval}_accumulation_steps --- experiments/exp-001/run_clm.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/experiments/exp-001/run_clm.py b/experiments/exp-001/run_clm.py index 209c66a..0529c26 100644 --- a/experiments/exp-001/run_clm.py +++ b/experiments/exp-001/run_clm.py @@ -473,6 +473,19 @@ def group_texts(examples): if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + import gc + del tokenized_datasets + gc.collect() + + # original training_args.per_device_train_batch_size = 8 + training_args.per_device_train_batch_size = 4 + training_args.gradient_accumulation_steps = 2 + + # original training_args.per_device_eval_batch_size = 8 + training_args.per_device_eval_batch_size = 4 + training_args.eval_accumulation_steps = 2 + + # Initialize our Trainer trainer = Trainer( model=model, @@ -491,6 +504,7 @@ def group_texts(examples): checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint + print("checkpoint:", checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload From 27d56756f991d7d40a2eee7079327bfb6cadb0fe Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 6 Oct 2021 16:27:11 -0400 Subject: [PATCH 033/142] update exp-001 exp-002 --- experiments/exp-001/README.md | 13 +--- experiments/exp-001/run_clm.py | 11 +-- experiments/exp-001/run_clm.sh | 22 ++++-- experiments/exp-001/run_clm_no_tok.sh | 13 +++- experiments/exp-002/eval_flue_cls.py | 99 +++++++++++++++++++++++++++ experiments/exp-002/eval_flue_cls.sh | 43 ++++++++++++ experiments/exp-002/eval_flue_paws.py | 95 +++++++++++++++++++++++++ experiments/exp-002/eval_flue_paws.sh | 43 ++++++++++++ experiments/exp-002/eval_flue_xnli.py | 95 +++++++++++++++++++++++++ experiments/exp-002/eval_flue_xnli.sh | 43 ++++++++++++ 10 files changed, 448 insertions(+), 29 deletions(-) create mode 100644 experiments/exp-002/eval_flue_cls.py create mode 100644 experiments/exp-002/eval_flue_cls.sh create mode 100644 experiments/exp-002/eval_flue_paws.py create mode 100644 experiments/exp-002/eval_flue_paws.sh create mode 100644 experiments/exp-002/eval_flue_xnli.py create mode 100644 experiments/exp-002/eval_flue_xnli.sh diff --git 
a/experiments/exp-001/README.md b/experiments/exp-001/README.md index 9d89cb5..b9a8ca9 100644 --- a/experiments/exp-001/README.md +++ b/experiments/exp-001/README.md @@ -3,20 +3,9 @@ **Dataset**: HF's OSCAR unshuffled_deduplicated_fr **Tokenizer**: byte-level Byte-pair encoding tokenizer (same as GPT-2). Training is identical to the section "Using an existing tokenizer" in huggingface's tokenizer_training [tutorial](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) +tokenizer_name: `/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer` - train the GPT-2 tokenizer with the exact same algorithms and parameters as an existing one. - vocab_size: 50,257 (same as original GPT-2) -**Model Finetuning**: Use [HF's code](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling) `run_clm.py` to finetune GPT-2 with the new tokenizer on oscar's fr dataset. -Freeze all the layers of GPT-2 except the word embedding layer `wte` and the positional embedding layer `wpe` by adding the following snippet of codes to `run_clm.py`. -``` -... -for name, param in model.named_parameters(): - if name not in ('transformer.wte.weight', 'transformer.wpe.weight'): - print(f"🥶 Freeze layer '{name}'") - param.requires_grad = False - else: - param.requires_grad = True -... -``` diff --git a/experiments/exp-001/run_clm.py b/experiments/exp-001/run_clm.py index 0529c26..4c1407f 100644 --- a/experiments/exp-001/run_clm.py +++ b/experiments/exp-001/run_clm.py @@ -476,15 +476,6 @@ def group_texts(examples): import gc del tokenized_datasets gc.collect() - - # original training_args.per_device_train_batch_size = 8 - training_args.per_device_train_batch_size = 4 - training_args.gradient_accumulation_steps = 2 - - # original training_args.per_device_eval_batch_size = 8 - training_args.per_device_eval_batch_size = 4 - training_args.eval_accumulation_steps = 2 - # Initialize our Trainer trainer = Trainer( @@ -504,7 +495,7 @@ def group_texts(examples): checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint - print("checkpoint:", checkpoint) + print("Checkpoint:", checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/experiments/exp-001/run_clm.sh b/experiments/exp-001/run_clm.sh index 35bd65a..a88ff21 100644 --- a/experiments/exp-001/run_clm.sh +++ b/experiments/exp-001/run_clm.sh @@ -4,12 +4,13 @@ #SBATCH --time=5-23:59:00 # Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 +#SBATCH --partition=3090-gcondo --gres=gpu:8 # Default resources are 1 core with 2.8GB of memory. 
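# The jump to --gres=gpu:8 pairs with the smaller per-device batch added
# further down in this patch: one optimizer step then covers roughly
#   per_device_train_batch_size * gradient_accumulation_steps * n_gpus
#   = 2 * 4 * 8 = 64 examples,
# assuming the HF Trainer picks up all 8 GPUs on the node for data parallelism.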
+#SBATCH --ntasks=16 # Use more memory (10GB) (CPU RAM): -#SBATCH --mem=150g +#SBATCH --mem=100g # Specify a job name: #SBATCH -J exp-001-run_clm @@ -28,7 +29,9 @@ source $FP_BIGS/env_lang_mod/bin/activate tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer" cache_dir="${FP_BIGS}/data/external/oscar_fr" -output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2" +output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-2" +logging_dir="${FP_BIGS}/reports/exp-001/ft-gpt2-2" +ckpt_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-2/checkpoint-25000" python $FP_BIGS/scripts/exp-001/run_clm.py \ --model_name_or_path gpt2 \ @@ -36,8 +39,19 @@ python $FP_BIGS/scripts/exp-001/run_clm.py \ --dataset_name oscar \ --cache_dir $cache_dir \ --dataset_config_name unshuffled_deduplicated_fr \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ --do_train \ --do_eval \ --output_dir $output_dir \ --preprocessing_num_workers 8 \ - --overwrite_output_dir \ No newline at end of file + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 1000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --resume_from_checkpoint $ckpt_dir \ No newline at end of file diff --git a/experiments/exp-001/run_clm_no_tok.sh b/experiments/exp-001/run_clm_no_tok.sh index 10aafa7..af2be7f 100644 --- a/experiments/exp-001/run_clm_no_tok.sh +++ b/experiments/exp-001/run_clm_no_tok.sh @@ -4,12 +4,12 @@ #SBATCH --time=5-23:59:00 # Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 +#SBATCH --partition=3090-gcondo --gres=gpu:8 # Default resources are 1 core with 2.8GB of memory. # Use more memory (10GB) (CPU RAM): -#SBATCH --mem=150g +#SBATCH --mem=100g # Specify a job name: #SBATCH -J exp-001-run_clm_no_tok @@ -39,4 +39,11 @@ python $FP_BIGS/scripts/exp-001/run_clm.py \ --do_eval \ --output_dir $output_dir \ --preprocessing_num_workers 8 \ - --overwrite_output_dir \ No newline at end of file + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 500 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ No newline at end of file diff --git a/experiments/exp-002/eval_flue_cls.py b/experiments/exp-002/eval_flue_cls.py new file mode 100644 index 0000000..93eb746 --- /dev/null +++ b/experiments/exp-002/eval_flue_cls.py @@ -0,0 +1,99 @@ +from datasets import load_dataset +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +args = parser.parse_args() + + +from datasets import load_dataset + +cls_train_datasetdict = load_dataset("flue", "CLS", + split=f"train", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue").train_test_split(train_size=0.8, shuffle=True, seed=42) +cls_train_dataset = cls_train_datasetdict['train'] +cls_val_dataset = cls_train_datasetdict['test'] +cls_test_dataset = load_dataset("flue", "CLS", + split="test", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") # "PAWS-X", "XNLI", "CLS", 
"WSD-V" + +import torch +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained('/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer') +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] + +full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) +full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) +full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) + +from transformers import TrainingArguments + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +from transformers import Trainer +from datasets import load_metric +import numpy as np + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics +) + +trainer.train() + +# best_model = GPT2ForSequenceClassification.from_pretrained(f'{args.output_dir}/checkpoint-9000', +# num_labels=2, +# pad_token_id=0) + +# trainer = Trainer( +# model=best_model, +# args=training_args, +# train_dataset=full_train_dataset, +# eval_dataset=full_test_dataset, +# compute_metrics=compute_metrics +# ) + +# print("Evaluate:", trainer.evaluate()) + + diff --git a/experiments/exp-002/eval_flue_cls.sh b/experiments/exp-002/eval_flue_cls.sh new file mode 100644 index 0000000..c22a977 --- /dev/null +++ b/experiments/exp-002/eval_flue_cls.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_cls + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME +done diff --git a/experiments/exp-002/eval_flue_paws.py b/experiments/exp-002/eval_flue_paws.py new file mode 100644 index 0000000..cd217a1 --- /dev/null +++ b/experiments/exp-002/eval_flue_paws.py @@ -0,0 +1,95 @@ +from datasets import load_dataset +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +args = parser.parse_args() + + +from datasets import load_dataset + +paws_dataset = load_dataset("flue", "PAWS-X", cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") +paws_train_dataset = paws_dataset['train'] +paws_val_dataset = paws_dataset['validation'] +paws_test_dataset = paws_dataset['test'] + +import torch +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained('/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer') +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +def tokenize_function(examples): + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from transformers import TrainingArguments + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + 
logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +from transformers import Trainer +from datasets import load_metric +import numpy as np + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics +) + +trainer.train() + +# best_model = GPT2ForSequenceClassification.from_pretrained(f'{args.output_dir}/checkpoint-9000', +# num_labels=2, +# pad_token_id=0) + +# trainer = Trainer( +# model=best_model, +# args=training_args, +# train_dataset=full_train_dataset, +# eval_dataset=full_test_dataset, +# compute_metrics=compute_metrics +# ) + +# print("Evaluate:", trainer.evaluate()) + + diff --git a/experiments/exp-002/eval_flue_paws.sh b/experiments/exp-002/eval_flue_paws.sh new file mode 100644 index 0000000..d86eae4 --- /dev/null +++ b/experiments/exp-002/eval_flue_paws.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_paws + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_paws.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_paws.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME +done diff --git a/experiments/exp-002/eval_flue_xnli.py b/experiments/exp-002/eval_flue_xnli.py new file mode 100644 index 0000000..84e809d --- /dev/null +++ b/experiments/exp-002/eval_flue_xnli.py @@ -0,0 +1,95 @@ +from datasets import load_dataset +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +args = parser.parse_args() + + +from datasets import load_dataset + +xnli_dataset = load_dataset("flue", "XNLI", cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") +xnli_train_dataset = xnli_dataset['train'] +xnli_val_dataset = xnli_dataset['validation'] +xnli_test_dataset = xnli_dataset['test'] + +import torch +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = 
GPT2Tokenizer.from_pretrained('/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer') +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=0) + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypo"]}', padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from transformers import TrainingArguments + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +from transformers import Trainer +from datasets import load_metric +import numpy as np + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics +) + +trainer.train() + +# best_model = GPT2ForSequenceClassification.from_pretrained(f'{args.output_dir}/checkpoint-9000', +# num_labels=2, +# pad_token_id=0) + +# trainer = Trainer( +# model=best_model, +# args=training_args, +# train_dataset=full_train_dataset, +# eval_dataset=full_test_dataset, +# compute_metrics=compute_metrics +# ) + +# print("Evaluate:", trainer.evaluate()) + + diff --git a/experiments/exp-002/eval_flue_xnli.sh b/experiments/exp-002/eval_flue_xnli.sh new file mode 100644 index 0000000..379555a --- /dev/null +++ b/experiments/exp-002/eval_flue_xnli.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
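# eval_flue_xnli.py above builds a 3-way classifier (num_labels=3 for
# entailment / neutral / contradiction); note that at this point in the series
# it still maps the paws_* dataset variables, which a later patch renames to
# xnli_train/val/test before the script can run end to end.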
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_xnli + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-xnli-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_xnli.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME +done From 5e81cbe41d78355b06c2cf37d21e4b2424d65ed6 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 6 Oct 2021 16:33:18 -0400 Subject: [PATCH 034/142] update README --- experiments/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/experiments/README.md b/experiments/README.md index 513efc3..025733d 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -1,5 +1,6 @@ # Current Experiments - `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_fr`. +- `exp-002`: evaluate gpt-2 on FLUE's tasks (CLS, XNLI, PAWS) # Carbon Tracking Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) From 19ac169f6bb47d50fd6780ad18a7e8bacc2c2393 Mon Sep 17 00:00:00 2001 From: Yong Zheng Xin Date: Wed, 13 Oct 2021 15:22:57 -0400 Subject: [PATCH 035/142] Update README.md --- experiments/exp-001/README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/experiments/exp-001/README.md b/experiments/exp-001/README.md index b9a8ca9..eb163f7 100644 --- a/experiments/exp-001/README.md +++ b/experiments/exp-001/README.md @@ -1,3 +1,15 @@ +# README + +- use `download_oscar_fr.sh` to download the datasets. To download datasets for other languages, make the necessary changes on line 8 in the `download_oscar_fr.py`. +- run `train_tokenizer_gpt2.py` to train the tokenizer for the new dataset. Make necessary changes on line 8 to load the dataset and line 20 to save the trained tokenizer. +- run `run_clm.sh` to train GPT-2. Important changes to arguments that might be made: + - `tokenizer_dir`: directory of saved tokenizer. + - `cache_dir`: directory of cached dataset from `download_oscar_fr.sh` (remember to make changes to the dataset use in the argument `dataset_name` and `dataset_config_name`). + - `output_dir`: directory where the gpt2 is checkpointed during training. + - `ckpt_dir`: used for continuing training from checkpoint. + +--- + # Decisions **Dataset**: HF's OSCAR unshuffled_deduplicated_fr @@ -7,5 +19,4 @@ tokenizer_name: `/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/osc - train the GPT-2 tokenizer with the exact same algorithms and parameters as an existing one. 
- vocab_size: 50,257 (same as original GPT-2) - From c3620d24fa87913f8948a8debcff6f8699294dca Mon Sep 17 00:00:00 2001 From: Yong Zheng Xin Date: Wed, 13 Oct 2021 15:23:29 -0400 Subject: [PATCH 036/142] Update README.md --- experiments/exp-001/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/experiments/exp-001/README.md b/experiments/exp-001/README.md index eb163f7..75eb89d 100644 --- a/experiments/exp-001/README.md +++ b/experiments/exp-001/README.md @@ -3,10 +3,10 @@ - use `download_oscar_fr.sh` to download the datasets. To download datasets for other languages, make the necessary changes on line 8 in the `download_oscar_fr.py`. - run `train_tokenizer_gpt2.py` to train the tokenizer for the new dataset. Make necessary changes on line 8 to load the dataset and line 20 to save the trained tokenizer. - run `run_clm.sh` to train GPT-2. Important changes to arguments that might be made: - - `tokenizer_dir`: directory of saved tokenizer. - - `cache_dir`: directory of cached dataset from `download_oscar_fr.sh` (remember to make changes to the dataset use in the argument `dataset_name` and `dataset_config_name`). - - `output_dir`: directory where the gpt2 is checkpointed during training. - - `ckpt_dir`: used for continuing training from checkpoint. + - `tokenizer_dir`: directory of saved tokenizer. + - `cache_dir`: directory of cached dataset from `download_oscar_fr.sh` (remember to make changes to the dataset use in the argument `dataset_name` and `dataset_config_name`). + - `output_dir`: directory where the gpt2 is checkpointed during training. + - `ckpt_dir`: used for continuing training from checkpoint. --- From 049960c18430d06ba3e4e2deababd7a92ee7a93f Mon Sep 17 00:00:00 2001 From: yongzx Date: Fri, 15 Oct 2021 00:29:17 -0400 Subject: [PATCH 037/142] update exp-002 and exp-004 --- experiments/README.md | 3 +- experiments/exp-002/eval_flue_cls.py | 94 +++++++------ experiments/exp-002/eval_flue_cls.sh | 10 +- experiments/exp-002/eval_flue_cls_books.py | 125 +++++++++++++++++ experiments/exp-002/eval_flue_cls_books.sh | 48 +++++++ experiments/exp-002/eval_flue_cls_dvd.py | 125 +++++++++++++++++ experiments/exp-002/eval_flue_cls_dvd.sh | 48 +++++++ experiments/exp-002/eval_flue_cls_music.py | 127 ++++++++++++++++++ experiments/exp-002/eval_flue_cls_music.sh | 48 +++++++ experiments/exp-002/eval_flue_paws.py | 90 +++++++------ experiments/exp-002/eval_flue_paws.sh | 9 +- experiments/exp-002/eval_flue_xnli.py | 94 +++++++------ experiments/exp-002/eval_flue_xnli.sh | 14 +- experiments/exp-002/gpt2_eval_flue_cls.sh | 44 ++++++ .../exp-002/gpt2_eval_flue_cls_books.sh | 46 +++++++ experiments/exp-002/gpt2_eval_flue_cls_dvd.sh | 47 +++++++ .../exp-002/gpt2_eval_flue_cls_music.sh | 46 +++++++ experiments/exp-002/gpt2_eval_flue_paws.sh | 44 ++++++ experiments/exp-004/download_pawsx.py | 9 ++ experiments/exp-004/download_pawsx.sh | 30 +++++ experiments/exp-004/eval_paws_en.py | 106 +++++++++++++++ experiments/exp-004/eval_paws_en.sh | 63 +++++++++ experiments/exp-004/eval_paws_fr.py | 108 +++++++++++++++ experiments/exp-004/eval_paws_fr_ft.sh | 46 +++++++ experiments/exp-004/eval_paws_fr_no_ft.sh | 46 +++++++ .../exp-004/eval_paws_fr_swapped_embedding.py | 117 ++++++++++++++++ .../eval_paws_fr_swapped_embedding_ft.sh | 48 +++++++ .../eval_paws_fr_swapped_embedding_no_ft.sh | 48 +++++++ 28 files changed, 1546 insertions(+), 137 deletions(-) create mode 100644 experiments/exp-002/eval_flue_cls_books.py create mode 100644 experiments/exp-002/eval_flue_cls_books.sh 
create mode 100644 experiments/exp-002/eval_flue_cls_dvd.py create mode 100644 experiments/exp-002/eval_flue_cls_dvd.sh create mode 100644 experiments/exp-002/eval_flue_cls_music.py create mode 100644 experiments/exp-002/eval_flue_cls_music.sh create mode 100644 experiments/exp-002/gpt2_eval_flue_cls.sh create mode 100644 experiments/exp-002/gpt2_eval_flue_cls_books.sh create mode 100644 experiments/exp-002/gpt2_eval_flue_cls_dvd.sh create mode 100644 experiments/exp-002/gpt2_eval_flue_cls_music.sh create mode 100644 experiments/exp-002/gpt2_eval_flue_paws.sh create mode 100644 experiments/exp-004/download_pawsx.py create mode 100644 experiments/exp-004/download_pawsx.sh create mode 100644 experiments/exp-004/eval_paws_en.py create mode 100644 experiments/exp-004/eval_paws_en.sh create mode 100644 experiments/exp-004/eval_paws_fr.py create mode 100644 experiments/exp-004/eval_paws_fr_ft.sh create mode 100644 experiments/exp-004/eval_paws_fr_no_ft.sh create mode 100644 experiments/exp-004/eval_paws_fr_swapped_embedding.py create mode 100644 experiments/exp-004/eval_paws_fr_swapped_embedding_ft.sh create mode 100644 experiments/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh diff --git a/experiments/README.md b/experiments/README.md index 025733d..66c5a09 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -1,6 +1,7 @@ # Current Experiments - `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_fr`. -- `exp-002`: evaluate gpt-2 on FLUE's tasks (CLS, XNLI, PAWS) +- `exp-002`: evaluate gpt-2-{finetuned on OSCAR-FR, base} on FLUE's tasks (CLS, XNLI, PAWS) +- `exp-004`: evaluate gpt-2 base and swapped-embedding-layers for PAWS-X # Carbon Tracking Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) diff --git a/experiments/exp-002/eval_flue_cls.py b/experiments/exp-002/eval_flue_cls.py index 93eb746..a0fe4c0 100644 --- a/experiments/exp-002/eval_flue_cls.py +++ b/experiments/exp-002/eval_flue_cls.py @@ -1,4 +1,13 @@ -from datasets import load_dataset +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + import argparse parser = argparse.ArgumentParser() parser.add_argument("output_dir") @@ -7,7 +16,11 @@ parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict from datasets import load_dataset @@ -22,25 +35,31 @@ cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") # "PAWS-X", "XNLI", "CLS", "WSD-V" import torch +import numpy as np +from transformers import TrainingArguments, Trainer from transformers import GPT2Tokenizer, GPT2ForSequenceClassification -tokenizer = GPT2Tokenizer.from_pretrained('/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer') -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - 
pad_token_id=0) +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) def tokenize_function(examples): return tokenizer(examples["text"], padding="max_length", truncation=True) tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] - full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) -from transformers import TrainingArguments + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) training_args = TrainingArguments( args.output_dir, @@ -60,40 +79,33 @@ def tokenize_function(examples): load_best_model_at_end=True, ) -from transformers import Trainer -from datasets import load_metric -import numpy as np - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics -) -trainer.train() - -# best_model = GPT2ForSequenceClassification.from_pretrained(f'{args.output_dir}/checkpoint-9000', -# num_labels=2, -# pad_token_id=0) - -# trainer = Trainer( -# model=best_model, -# args=training_args, -# train_dataset=full_train_dataset, -# eval_dataset=full_test_dataset, -# compute_metrics=compute_metrics -# ) - -# print("Evaluate:", trainer.evaluate()) +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) diff --git a/experiments/exp-002/eval_flue_cls.sh b/experiments/exp-002/eval_flue_cls.sh index c22a977..00adecd 100644 --- a/experiments/exp-002/eval_flue_cls.sh +++ b/experiments/exp-002/eval_flue_cls.sh @@ -30,8 +30,10 @@ learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) # learning_rates=( 1e-5 ) for lr in ${learning_rates[@]} ; do echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-base/$lr" + # MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + MODEL_NAME="gpt-2" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" mkdir -p $OUTPUT_DIR python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \ @@ -39,5 +41,7 @@ for lr in ${learning_rates[@]} ; do --learning_rate $lr \ --per_device_train_batch_size 4 \ --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + 
--do_train done diff --git a/experiments/exp-002/eval_flue_cls_books.py b/experiments/exp-002/eval_flue_cls_books.py new file mode 100644 index 0000000..141f579 --- /dev/null +++ b/experiments/exp-002/eval_flue_cls_books.py @@ -0,0 +1,125 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset + +cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split=f"train", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) +cls_train_dataset = cls_train_datasetdict['train'] +cls_val_dataset = cls_train_datasetdict['test'] +cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split="test", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" + +print("Before splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +# split: books +cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="books") +cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="books") +cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="books") + +print("After splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) +full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) +full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) + + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + 
per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) \ No newline at end of file diff --git a/experiments/exp-002/eval_flue_cls_books.sh b/experiments/exp-002/eval_flue_cls_books.sh new file mode 100644 index 0000000..15a064a --- /dev/null +++ b/experiments/exp-002/eval_flue_cls_books.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_cls_books + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_books.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_books.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change books + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-books-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + # change books + python $FP_BIGS/scripts/exp-002/eval_flue_cls_books.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/experiments/exp-002/eval_flue_cls_dvd.py b/experiments/exp-002/eval_flue_cls_dvd.py new file mode 100644 index 0000000..0b8f675 --- /dev/null +++ b/experiments/exp-002/eval_flue_cls_dvd.py @@ -0,0 +1,125 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") 
+parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset + +cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split=f"train", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) +cls_train_dataset = cls_train_datasetdict['train'] +cls_val_dataset = cls_train_datasetdict['test'] +cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split="test", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" + +print("Before splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +# split: dvd +cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="dvd") +cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="dvd") +cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="dvd") + +print("After splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) +full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) +full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) + + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) \ No newline at end of file diff --git 
a/experiments/exp-002/eval_flue_cls_dvd.sh b/experiments/exp-002/eval_flue_cls_dvd.sh new file mode 100644 index 0000000..c6c3f39 --- /dev/null +++ b/experiments/exp-002/eval_flue_cls_dvd.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_cls_dvd + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_dvd.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_dvd.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change dvd + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-dvd-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + # change dvd + python $FP_BIGS/scripts/exp-002/eval_flue_cls_dvd.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/experiments/exp-002/eval_flue_cls_music.py b/experiments/exp-002/eval_flue_cls_music.py new file mode 100644 index 0000000..721843c --- /dev/null +++ b/experiments/exp-002/eval_flue_cls_music.py @@ -0,0 +1,127 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset + +cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split=f"train", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) +cls_train_dataset = cls_train_datasetdict['train'] +cls_val_dataset = cls_train_datasetdict['test'] +cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split="test", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" + +print("Before splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +# split: music 
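# Same pattern as eval_flue_cls_books.py and eval_flue_cls_dvd.py: the CLS
# train/val/test splits are restricted to a single product category below, so
# each domain gets its own classifier and its own accuracy figure.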
+cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="music") +cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="music") +cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="music") + +print("After splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) +full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) +full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) + + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/experiments/exp-002/eval_flue_cls_music.sh b/experiments/exp-002/eval_flue_cls_music.sh new file mode 100644 index 0000000..fd29ce9 --- /dev/null +++ b/experiments/exp-002/eval_flue_cls_music.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
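# MODEL_NAME below points at the OSCAR-fr fine-tuned GPT-2 checkpoint
# (ft-gpt2-2/checkpoint-110500) and TOKENIZER_NAME at the retrained French
# tokenizer, so this run scores the adapted model; the gpt2_eval_flue_*
# scripts added in the same patch presumably cover the unadapted gpt2 baseline.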
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_cls_music + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_music.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_music.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change music + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-music-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + # change music + python $FP_BIGS/scripts/exp-002/eval_flue_cls_music.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/experiments/exp-002/eval_flue_paws.py b/experiments/exp-002/eval_flue_paws.py index cd217a1..c33649a 100644 --- a/experiments/exp-002/eval_flue_paws.py +++ b/experiments/exp-002/eval_flue_paws.py @@ -1,4 +1,13 @@ -from datasets import load_dataset +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + import argparse parser = argparse.ArgumentParser() parser.add_argument("output_dir") @@ -7,7 +16,11 @@ parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict from datasets import load_dataset @@ -18,12 +31,11 @@ paws_test_dataset = paws_dataset['test'] import torch +import numpy as np +from transformers import Trainer, TrainingArguments from transformers import GPT2Tokenizer, GPT2ForSequenceClassification -tokenizer = GPT2Tokenizer.from_pretrained('/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer') -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) def tokenize_function(examples): return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) @@ -36,7 +48,14 @@ def tokenize_function(examples): small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) -from transformers import TrainingArguments +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) training_args = TrainingArguments( args.output_dir, @@ -56,40 
+75,29 @@ def tokenize_function(examples): load_best_model_at_end=True, ) -from transformers import Trainer -from datasets import load_metric -import numpy as np - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics -) - -trainer.train() - -# best_model = GPT2ForSequenceClassification.from_pretrained(f'{args.output_dir}/checkpoint-9000', -# num_labels=2, -# pad_token_id=0) - -# trainer = Trainer( -# model=best_model, -# args=training_args, -# train_dataset=full_train_dataset, -# eval_dataset=full_test_dataset, -# compute_metrics=compute_metrics -# ) - -# print("Evaluate:", trainer.evaluate()) +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +if args.do_train: + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) diff --git a/experiments/exp-002/eval_flue_paws.sh b/experiments/exp-002/eval_flue_paws.sh index d86eae4..8644967 100644 --- a/experiments/exp-002/eval_flue_paws.sh +++ b/experiments/exp-002/eval_flue_paws.sh @@ -26,12 +26,13 @@ set +a module load python/3.7.4 source $FP_BIGS/env_lang_mod/bin/activate -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 5e-6 ) for lr in ${learning_rates[@]} ; do echo "LR ===== $lr" OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-tok/$lr" MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" mkdir -p $OUTPUT_DIR python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ @@ -39,5 +40,7 @@ for lr in ${learning_rates[@]} ; do --learning_rate $lr \ --per_device_train_batch_size 4 \ --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train done diff --git a/experiments/exp-002/eval_flue_xnli.py b/experiments/exp-002/eval_flue_xnli.py index 84e809d..8d48218 100644 --- a/experiments/exp-002/eval_flue_xnli.py +++ b/experiments/exp-002/eval_flue_xnli.py @@ -1,4 +1,13 @@ -from datasets import load_dataset +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + import argparse parser = argparse.ArgumentParser() parser.add_argument("output_dir") @@ -7,7 +16,11 @@ parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt 
allow do_train followed by do_predict from datasets import load_dataset @@ -18,25 +31,31 @@ xnli_test_dataset = xnli_dataset['test'] import torch +import numpy as np +from transformers import TrainingArguments, Trainer from transformers import GPT2Tokenizer, GPT2ForSequenceClassification -tokenizer = GPT2Tokenizer.from_pretrained('/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer') -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=0) +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) def tokenize_function(examples): return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypo"]}', padding="max_length", truncation=True) tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +full_train_dataset = xnli_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = xnli_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = xnli_test_dataset.map(tokenize_function, batched=False) small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) -from transformers import TrainingArguments +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) training_args = TrainingArguments( args.output_dir, @@ -56,40 +75,29 @@ def tokenize_function(examples): load_best_model_at_end=True, ) -from transformers import Trainer -from datasets import load_metric -import numpy as np - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics -) - -trainer.train() - -# best_model = GPT2ForSequenceClassification.from_pretrained(f'{args.output_dir}/checkpoint-9000', -# num_labels=2, -# pad_token_id=0) - -# trainer = Trainer( -# model=best_model, -# args=training_args, -# train_dataset=full_train_dataset, -# eval_dataset=full_test_dataset, -# compute_metrics=compute_metrics -# ) +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=0) -# print("Evaluate:", trainer.evaluate()) +if args.do_train: + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) diff --git a/experiments/exp-002/eval_flue_xnli.sh b/experiments/exp-002/eval_flue_xnli.sh index 379555a..619b856 100644 --- a/experiments/exp-002/eval_flue_xnli.sh +++ b/experiments/exp-002/eval_flue_xnli.sh @@ -1,15 +1,16 @@ #!/bin/bash # Request half an 
hour of runtime: -#SBATCH --time=2-23:59:00 +#SBATCH --time=6-23:59:00 # Ask for the GPU partition and 1 GPU #SBATCH --partition=3090-gcondo --gres=gpu:1 # Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 # Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g +#SBATCH --mem=50g # Specify a job name: #SBATCH -J exp-002-eval_flue_xnli @@ -26,18 +27,21 @@ set +a module load python/3.7.4 source $FP_BIGS/env_lang_mod/bin/activate -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 5e-5 1e-6 5e-6 ) # learning_rates=( 1e-5 ) for lr in ${learning_rates[@]} ; do echo "LR ===== $lr" OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-xnli-gpt2-tok/$lr" MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" mkdir -p $OUTPUT_DIR python $FP_BIGS/scripts/exp-002/eval_flue_xnli.py $OUTPUT_DIR \ - --num_train_epochs 30 \ + --num_train_epochs 10 \ --learning_rate $lr \ --per_device_train_batch_size 4 \ --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train done diff --git a/experiments/exp-002/gpt2_eval_flue_cls.sh b/experiments/exp-002/gpt2_eval_flue_cls.sh new file mode 100644 index 0000000..6cf0746 --- /dev/null +++ b/experiments/exp-002/gpt2_eval_flue_cls.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_cls + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $MODEL_NAME +done diff --git a/experiments/exp-002/gpt2_eval_flue_cls_books.sh b/experiments/exp-002/gpt2_eval_flue_cls_books.sh new file mode 100644 index 0000000..135e5fb --- /dev/null +++ b/experiments/exp-002/gpt2_eval_flue_cls_books.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
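+# Note (sketch of intent, inferred from the variables below): the gpt2_eval_flue_* scripts
+# evaluate the unadapted English `gpt2` checkpoint (MODEL_NAME="gpt2", tokenizer "gpt2") as a
+# baseline, whereas the eval_flue_* scripts above point at the OSCAR-fr fine-tuned checkpoint
+# and its French tokenizer.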
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_cls_books + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_books.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_books.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change books + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-books-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + # change books + python $FP_BIGS/scripts/exp-002/eval_flue_cls_books.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer "gpt2" +done diff --git a/experiments/exp-002/gpt2_eval_flue_cls_dvd.sh b/experiments/exp-002/gpt2_eval_flue_cls_dvd.sh new file mode 100644 index 0000000..2507391 --- /dev/null +++ b/experiments/exp-002/gpt2_eval_flue_cls_dvd.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_cls_dvd + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_dvd.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_dvd.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change dvd + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-dvd-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + # change dvd + python $FP_BIGS/scripts/exp-002/eval_flue_cls_dvd.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer "gpt2" + +done diff --git a/experiments/exp-002/gpt2_eval_flue_cls_music.sh b/experiments/exp-002/gpt2_eval_flue_cls_music.sh new file mode 100644 index 0000000..8b4fc4b --- /dev/null +++ b/experiments/exp-002/gpt2_eval_flue_cls_music.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_cls_music + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_music.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_music.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change music + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-music-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + # change music + python $FP_BIGS/scripts/exp-002/eval_flue_cls_music.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer "gpt2" +done diff --git a/experiments/exp-002/gpt2_eval_flue_paws.sh b/experiments/exp-002/gpt2_eval_flue_paws.sh new file mode 100644 index 0000000..7ebb253 --- /dev/null +++ b/experiments/exp-002/gpt2_eval_flue_paws.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_paws + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_paws.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_paws.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 5e-6 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer "gpt2" +done diff --git a/experiments/exp-004/download_pawsx.py b/experiments/exp-004/download_pawsx.py new file mode 100644 index 0000000..2d9e976 --- /dev/null +++ b/experiments/exp-004/download_pawsx.py @@ -0,0 +1,9 @@ +from datasets import load_dataset +from dotenv import load_dotenv +import os +from pathlib import Path + +load_dotenv(str(Path.home() / ".env")) + +dataset = load_dataset("paws-x", 'fr', cache_dir=f"{os.getenv('FP_BIGS')}/data/external/paws-x") +print("Done") \ No newline at end of file diff --git a/experiments/exp-004/download_pawsx.sh b/experiments/exp-004/download_pawsx.sh new file mode 100644 index 0000000..aa9e806 --- /dev/null +++ b/experiments/exp-004/download_pawsx.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=3-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
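+# Assumes ~/.env exports FP_BIGS (the project base path); download_pawsx.py reads it via
+# python-dotenv and caches the French PAWS-X split under $FP_BIGS/data/external/paws-x.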
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-004-download_pawsx + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/download_pawsx.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/download_pawsx.err + +# Set up the environment by loading modules +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +python3 $FP_BIGS/scripts/exp-004/download_pawsx.py \ No newline at end of file diff --git a/experiments/exp-004/eval_paws_en.py b/experiments/exp-004/eval_paws_en.py new file mode 100644 index 0000000..7522436 --- /dev/null +++ b/experiments/exp-004/eval_paws_en.py @@ -0,0 +1,106 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + +from datasets import load_dataset +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset + +paws_dataset = load_dataset("paws-x", 'en', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") +paws_train_dataset = paws_dataset['train'] +paws_val_dataset = paws_dataset['validation'] +paws_test_dataset = paws_dataset['test'] + +import torch +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from transformers import TrainingArguments + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +from 
transformers import Trainer +from datasets import load_metric +import numpy as np + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +if args.do_train: + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/experiments/exp-004/eval_paws_en.sh b/experiments/exp-004/eval_paws_en.sh new file mode 100644 index 0000000..9496863 --- /dev/null +++ b/experiments/exp-004/eval_paws_en.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_en + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_en.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_en.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +# for lr in ${learning_rates[@]} ; do +# echo "LR ===== $lr" +# OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" +# MODEL_NAME="gpt2" +# TOKENIZER_NAME="gpt2" +# mkdir -p $OUTPUT_DIR + +# python $FP_BIGS/scripts/exp-004/eval_paws_en.py $OUTPUT_DIR \ +# --num_train_epochs 30 \ +# --learning_rate $lr \ +# --per_device_train_batch_size 4 \ +# --gradient_accumulation_steps 4 \ +# --pretrained_model $MODEL_NAME \ +# --tokenizer $TOKENIZER_NAME \ +# --do_train +# done + +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" + TOKENIZER_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_en.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_predict +done \ No newline at end of file diff --git a/experiments/exp-004/eval_paws_fr.py b/experiments/exp-004/eval_paws_fr.py new file mode 100644 index 0000000..180c03e --- /dev/null +++ b/experiments/exp-004/eval_paws_fr.py @@ -0,0 +1,108 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) 
+parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +logging.info("Load Tokenizer") +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +logging.info("Load Raw Dataset") +paws_dataset = load_dataset("paws-x", 'fr', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") +paws_train_dataset = paws_dataset['train'] +paws_val_dataset = paws_dataset['validation'] +paws_test_dataset = paws_dataset['test'] + +def tokenize_function(examples): + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) + +logging.info("Load Dataset Ready for Training") +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logging.info("Load Metric") +from datasets import load_metric +metric = load_metric("accuracy") +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/experiments/exp-004/eval_paws_fr_ft.sh b/experiments/exp-004/eval_paws_fr_ft.sh new file mode 100644 index 0000000..8f14a14 --- /dev/null +++ b/experiments/exp-004/eval_paws_fr_ft.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU 
partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_fr_ft + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_ft.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_ft.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-base/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr/checkpoint-92610" + TOKENIZER_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_fr.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/experiments/exp-004/eval_paws_fr_no_ft.sh b/experiments/exp-004/eval_paws_fr_no_ft.sh new file mode 100644 index 0000000..5a57c6b --- /dev/null +++ b/experiments/exp-004/eval_paws_fr_no_ft.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_fr_no_ft + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_no_ft.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_no_ft.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-base/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr/checkpoint-92610" + TOKENIZER_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_fr.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_predict +done diff --git a/experiments/exp-004/eval_paws_fr_swapped_embedding.py b/experiments/exp-004/eval_paws_fr_swapped_embedding.py new file mode 100644 index 0000000..3199a41 --- /dev/null +++ b/experiments/exp-004/eval_paws_fr_swapped_embedding.py @@ -0,0 +1,117 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) 
+parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--fr_gpt2_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +logging.info("Load Tokenizer") +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +logging.info("Load Raw Dataset") +paws_dataset = load_dataset("paws-x", 'fr', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") +paws_train_dataset = paws_dataset['train'] +paws_val_dataset = paws_dataset['validation'] +paws_test_dataset = paws_dataset['test'] + +def tokenize_function(examples): + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) + +logging.info("Load Dataset Ready for Training") +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logging.info("Load Metric") +from datasets import load_metric +metric = load_metric("accuracy") +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +fr_model = GPT2ForSequenceClassification.from_pretrained(args.fr_gpt2_model, + num_labels=2, + pad_token_id=0) + +# swapped the embedding layers +model.transformer.wte.weight = fr_model.transformer.wte.weight +model.transformer.wpe.weight = fr_model.transformer.wpe.weight + +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/experiments/exp-004/eval_paws_fr_swapped_embedding_ft.sh 
b/experiments/exp-004/eval_paws_fr_swapped_embedding_ft.sh new file mode 100644 index 0000000..b177cc9 --- /dev/null +++ b/experiments/exp-004/eval_paws_fr_swapped_embedding_ft.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_fr_swapped_embedding_ft + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_ft.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_ft.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-rp-embedding/$lr" + EN_MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" + FR_MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-111500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_fr_swapped_embedding.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $EN_MODEL_NAME \ + --fr_gpt2_model $FR_MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/experiments/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh b/experiments/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh new file mode 100644 index 0000000..6af8422 --- /dev/null +++ b/experiments/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
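+# The "no_ft" variant only passes --do_predict: it evaluates the English PAWS classifier with
+# its wte/wpe embeddings swapped for the French GPT-2 ones, without any further fine-tuning on
+# PAWS-X fr (compare eval_paws_fr_swapped_embedding_ft.sh above, which trains with --do_train).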
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_fr_swapped_embedding_no_ft + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_no_ft.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_no_ft.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" + EN_MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" + FR_MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-111500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_fr_swapped_embedding.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $EN_MODEL_NAME \ + --fr_gpt2_model $FR_MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_predict +done From d611b2e4ced91f6c33cfd12ce9c69c7865cf4814 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 18 Oct 2021 15:19:19 -0400 Subject: [PATCH 038/142] requirements.txt --- experiments/requirements.txt | 133 +++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 experiments/requirements.txt diff --git a/experiments/requirements.txt b/experiments/requirements.txt new file mode 100644 index 0000000..a4e486a --- /dev/null +++ b/experiments/requirements.txt @@ -0,0 +1,133 @@ +absl-py==0.14.0 +anyio==3.3.1 +argcomplete==1.12.3 +argon2-cffi==21.1.0 +attrs==21.2.0 +Babel==2.9.1 +backcall==0.2.0 +bleach==4.1.0 +cachetools==4.2.2 +certifi==2021.5.30 +cffi==1.14.6 +charset-normalizer==2.0.4 +click==8.0.1 +configparser==5.0.2 +datasets==1.11.0 +debugpy==1.4.3 +decorator==5.0.9 +defusedxml==0.7.1 +dill==0.3.4 +docker-pycreds==0.4.0 +entrypoints==0.3 +filelock==3.0.12 +fsspec==2021.8.1 +gitdb==4.0.7 +GitPython==3.1.24 +google-auth==1.35.0 +google-auth-oauthlib==0.4.6 +grpcio==1.41.0 +huggingface-hub==0.0.16 +idna==3.2 +importlib-metadata==4.8.1 +ipykernel==6.4.1 +ipython==7.27.0 +ipython-genutils==0.2.0 +ipywidgets==7.6.4 +jedi==0.18.0 +Jinja2==3.0.1 +joblib==1.0.1 +json5==0.9.6 +jsonschema==3.2.0 +jupyter==1.0.0 +jupyter-client==7.0.2 +jupyter-console==6.4.0 +jupyter-core==4.7.1 +jupyter-server==1.11.0 +jupyterlab==3.1.11 +jupyterlab-pygments==0.1.2 +jupyterlab-server==2.8.1 +jupyterlab-widgets==1.0.1 +lxml==4.6.3 +Markdown==3.3.4 +MarkupSafe==2.0.1 +matplotlib-inline==0.1.3 +mistune==0.8.4 +multiprocess==0.70.12.2 +nbclassic==0.3.1 +nbclient==0.5.4 +nbconvert==6.1.0 +nbformat==5.1.3 +nest-asyncio==1.5.1 +notebook==6.4.3 +numpy==1.21.2 +oauthlib==3.1.1 +packaging==21.0 +pandas==1.3.2 +pandocfilters==1.4.3 +parso==0.8.2 +pathtools==0.1.2 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==8.3.2 +prometheus-client==0.11.0 +promise==2.3 +prompt-toolkit==3.0.20 +protobuf==3.18.0 +psutil==5.8.0 +ptyprocess==0.7.0 +pyarrow==5.0.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.20 +Pygments==2.10.0 +pyparsing==2.4.7 +pyrsistent==0.18.0 +python-dateutil==2.8.2 +python-dotenv==0.19.0 +pytz==2021.1 +PyYAML==5.4.1 +pyzmq==22.2.1 +qtconsole==5.1.1 +QtPy==1.11.0 
+regex==2021.8.28 +requests==2.26.0 +requests-oauthlib==1.3.0 +requests-unixsocket==0.2.0 +rsa==4.7.2 +sacremoses==0.0.45 +scikit-learn==0.24.2 +scipy==1.7.1 +Send2Trash==1.8.0 +sentry-sdk==1.4.2 +shortuuid==1.0.1 +six==1.16.0 +sklearn==0.0 +smmap==4.0.0 +sniffio==1.2.0 +subprocess32==3.5.4 +tensorboard==2.6.0 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.0 +termcolor==1.1.0 +terminado==0.12.1 +testpath==0.5.0 +threadpoolctl==2.2.0 +tokenizers==0.10.3 +torch==1.9.0+cu111 +torchaudio==0.9.0 +torchvision==0.10.0+cu111 +tornado==6.1 +tqdm==4.62.2 +traitlets==5.1.0 +transformers @ git+https://github.com/huggingface/transformers@010965dcde8ce9526f6a7e6e2c3f36276c153708 +typing-extensions==3.10.0.2 +urllib3==1.26.6 +wandb==0.12.2 +wcwidth==0.2.5 +webencodings==0.5.1 +websocket-client==1.2.1 +Werkzeug==2.0.1 +widgetsnbextension==3.5.1 +xxhash==2.0.2 +yaspin==2.1.0 +zipp==3.5.0 From 5fe1e584987a9c08650d40127b6ad67aa755228f Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 27 Oct 2021 15:00:20 -0400 Subject: [PATCH 039/142] update --- scripts/README.md | 7 + scripts/exp-001/README.md | 22 + scripts/exp-001/download_oscar_fr.py | 9 + scripts/exp-001/download_oscar_fr.sh | 30 + scripts/exp-001/run_clm.py | 549 ++++++++++++++++++ scripts/exp-001/run_clm.sh | 57 ++ scripts/exp-001/run_clm_no_tok.sh | 49 ++ scripts/exp-001/train_tokenizer_gpt2.py | 20 + scripts/exp-002/eval_flue_cls.py | 111 ++++ scripts/exp-002/eval_flue_cls.sh | 47 ++ scripts/exp-002/eval_flue_cls_books.py | 125 ++++ scripts/exp-002/eval_flue_cls_books.sh | 48 ++ scripts/exp-002/eval_flue_cls_dvd.py | 125 ++++ scripts/exp-002/eval_flue_cls_dvd.sh | 48 ++ scripts/exp-002/eval_flue_cls_music.py | 127 ++++ scripts/exp-002/eval_flue_cls_music.sh | 48 ++ scripts/exp-002/eval_flue_paws.py | 103 ++++ scripts/exp-002/eval_flue_paws.sh | 46 ++ scripts/exp-002/eval_flue_xnli.py | 132 +++++ scripts/exp-002/eval_flue_xnli.sh | 47 ++ scripts/exp-002/gpt2_eval_flue_cls.sh | 44 ++ scripts/exp-002/gpt2_eval_flue_cls_books.sh | 46 ++ scripts/exp-002/gpt2_eval_flue_cls_dvd.sh | 47 ++ scripts/exp-002/gpt2_eval_flue_cls_music.sh | 46 ++ scripts/exp-002/gpt2_eval_flue_paws.sh | 44 ++ scripts/exp-002/gpt2_eval_flue_xnli.sh | 47 ++ scripts/exp-004/download_pawsx.py | 9 + scripts/exp-004/download_pawsx.sh | 30 + scripts/exp-004/eval_paws_en.py | 106 ++++ scripts/exp-004/eval_paws_en.sh | 63 ++ scripts/exp-004/eval_paws_fr.py | 108 ++++ scripts/exp-004/eval_paws_fr_ft.sh | 46 ++ scripts/exp-004/eval_paws_fr_no_ft.sh | 46 ++ .../exp-004/eval_paws_fr_swapped_embedding.py | 117 ++++ .../eval_paws_fr_swapped_embedding_ft.sh | 48 ++ .../eval_paws_fr_swapped_embedding_no_ft.sh | 48 ++ scripts/requirements.txt | 133 +++++ 37 files changed, 2778 insertions(+) create mode 100644 scripts/README.md create mode 100644 scripts/exp-001/README.md create mode 100644 scripts/exp-001/download_oscar_fr.py create mode 100644 scripts/exp-001/download_oscar_fr.sh create mode 100644 scripts/exp-001/run_clm.py create mode 100644 scripts/exp-001/run_clm.sh create mode 100644 scripts/exp-001/run_clm_no_tok.sh create mode 100644 scripts/exp-001/train_tokenizer_gpt2.py create mode 100644 scripts/exp-002/eval_flue_cls.py create mode 100644 scripts/exp-002/eval_flue_cls.sh create mode 100644 scripts/exp-002/eval_flue_cls_books.py create mode 100644 scripts/exp-002/eval_flue_cls_books.sh create mode 100644 scripts/exp-002/eval_flue_cls_dvd.py create mode 100644 scripts/exp-002/eval_flue_cls_dvd.sh create mode 100644 scripts/exp-002/eval_flue_cls_music.py create mode 100644 
scripts/exp-002/eval_flue_cls_music.sh create mode 100644 scripts/exp-002/eval_flue_paws.py create mode 100644 scripts/exp-002/eval_flue_paws.sh create mode 100644 scripts/exp-002/eval_flue_xnli.py create mode 100644 scripts/exp-002/eval_flue_xnli.sh create mode 100644 scripts/exp-002/gpt2_eval_flue_cls.sh create mode 100644 scripts/exp-002/gpt2_eval_flue_cls_books.sh create mode 100644 scripts/exp-002/gpt2_eval_flue_cls_dvd.sh create mode 100644 scripts/exp-002/gpt2_eval_flue_cls_music.sh create mode 100644 scripts/exp-002/gpt2_eval_flue_paws.sh create mode 100644 scripts/exp-002/gpt2_eval_flue_xnli.sh create mode 100644 scripts/exp-004/download_pawsx.py create mode 100644 scripts/exp-004/download_pawsx.sh create mode 100644 scripts/exp-004/eval_paws_en.py create mode 100644 scripts/exp-004/eval_paws_en.sh create mode 100644 scripts/exp-004/eval_paws_fr.py create mode 100644 scripts/exp-004/eval_paws_fr_ft.sh create mode 100644 scripts/exp-004/eval_paws_fr_no_ft.sh create mode 100644 scripts/exp-004/eval_paws_fr_swapped_embedding.py create mode 100644 scripts/exp-004/eval_paws_fr_swapped_embedding_ft.sh create mode 100644 scripts/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh create mode 100644 scripts/requirements.txt diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..66c5a09 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,7 @@ +# Current Experiments +- `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_fr`. +- `exp-002`: evaluate gpt-2-{finetuned on OSCAR-FR, base} on FLUE's tasks (CLS, XNLI, PAWS) +- `exp-004`: evaluate gpt-2 base and swapped-embedding-layers for PAWS-X + +# Carbon Tracking +Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) diff --git a/scripts/exp-001/README.md b/scripts/exp-001/README.md new file mode 100644 index 0000000..75eb89d --- /dev/null +++ b/scripts/exp-001/README.md @@ -0,0 +1,22 @@ +# README + +- use `download_oscar_fr.sh` to download the datasets. To download datasets for other languages, make the necessary changes on line 8 in the `download_oscar_fr.py`. +- run `train_tokenizer_gpt2.py` to train the tokenizer for the new dataset. Make necessary changes on line 8 to load the dataset and line 20 to save the trained tokenizer. +- run `run_clm.sh` to train GPT-2. Important changes to arguments that might be made: + - `tokenizer_dir`: directory of saved tokenizer. + - `cache_dir`: directory of cached dataset from `download_oscar_fr.sh` (remember to make changes to the dataset use in the argument `dataset_name` and `dataset_config_name`). + - `output_dir`: directory where the gpt2 is checkpointed during training. + - `ckpt_dir`: used for continuing training from checkpoint. + +--- + +# Decisions + +**Dataset**: HF's OSCAR unshuffled_deduplicated_fr + +**Tokenizer**: byte-level Byte-pair encoding tokenizer (same as GPT-2). Training is identical to the section "Using an existing tokenizer" in huggingface's tokenizer_training [tutorial](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) +tokenizer_name: `/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer` +- train the GPT-2 tokenizer with the exact same algorithms and parameters as an existing one. 
+- vocab_size: 50,257 (same as original GPT-2) + + diff --git a/scripts/exp-001/download_oscar_fr.py b/scripts/exp-001/download_oscar_fr.py new file mode 100644 index 0000000..fd04779 --- /dev/null +++ b/scripts/exp-001/download_oscar_fr.py @@ -0,0 +1,9 @@ +from datasets import load_dataset +from dotenv import load_dotenv +import os +from pathlib import Path + +load_dotenv(str(Path.home() / ".env")) + +dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_fr") +print("Done") \ No newline at end of file diff --git a/scripts/exp-001/download_oscar_fr.sh b/scripts/exp-001/download_oscar_fr.sh new file mode 100644 index 0000000..a558c9e --- /dev/null +++ b/scripts/exp-001/download_oscar_fr.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=3-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-031-download_oscar_fr + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_fr.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_fr.err + +# Set up the environment by loading modules +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +python3 $FP_BIGS/scripts/exp-001/download_oscar_fr.py \ No newline at end of file diff --git a/scripts/exp-001/run_clm.py b/scripts/exp-001/run_clm.py new file mode 100644 index 0000000..4c1407f --- /dev/null +++ b/scripts/exp-001/run_clm.py @@ -0,0 +1,549 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=causal-lm +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
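+#
+# In this copy only the embedding layers are trained: after the model is loaded, every
+# parameter except `transformer.wte.weight` (token embeddings) and `transformer.wpe.weight`
+# (position embeddings) has `requires_grad` set to False (see the loop over
+# `model.named_parameters()` below), so the OSCAR-fr adaptation updates the embeddings only.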
+ +import torch +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. 
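+        # e.g. `python run_clm.py args.json` (any JSON file of argument values), which supplies
+        # the ModelArguments, DataTrainingArguments and TrainingArguments fields in one place
+        # instead of individual command-line flags.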
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
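+        # For datasets like OSCAR's unshuffled_deduplicated_fr that ship only a "train" split,
+        # the validation set is carved out of the training data below using
+        # validation_split_percentage (default 5%).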
+ raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + ) + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
+ ) + + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + model.resize_token_embeddings(len(tokenizer)) + for name, param in model.named_parameters(): + if name not in ('transformer.wte.weight', 'transformer.wpe.weight'): + print(f"🥶 Freeze layer '{name}'") + param.requires_grad = False + else: + param.requires_grad = True + + # Preprocessing the datasets. + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output + + with training_args.main_process_first(desc="dataset map tokenization"): + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.output_dir}/tokenized_datasets.pt") + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + print("Sanity check: loaded tokenized_datasets") + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + print("Sanity check: saved tokenized_datasets") + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. 
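+        # Each feature is a list of token-id lists at this point; the concatenation below flattens every
+        # feature into one long sequence, which is then re-split into contiguous `block_size` chunks.
+        # `labels` are simply a copy of `input_ids`, since the causal-LM head shifts them internally.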
+ concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.output_dir}/lm_datasets.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + print("Sanity check: loaded lm_datasets") + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + print("Sanity check: saved lm_datasets") + + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + import gc + del tokenized_datasets + gc.collect() + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. 
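+        # After `group_texts`, every example is exactly `block_size` tokens long, so no padding is needed
+        # and the simple `default_data_collator` (plain tensor stacking) is sufficient.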
+ data_collator=default_data_collator, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + print("Checkpoint:", checkpoint) + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub: + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + trainer.push_to_hub(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/scripts/exp-001/run_clm.sh b/scripts/exp-001/run_clm.sh new file mode 100644 index 0000000..a88ff21 --- /dev/null +++ b/scripts/exp-001/run_clm.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=5-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:8 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=16 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-001-run_clm + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer" +cache_dir="${FP_BIGS}/data/external/oscar_fr" +output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-2" +logging_dir="${FP_BIGS}/reports/exp-001/ft-gpt2-2" +ckpt_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-2/checkpoint-25000" + +python $FP_BIGS/scripts/exp-001/run_clm.py \ + --model_name_or_path gpt2 \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name unshuffled_deduplicated_fr \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 1000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --resume_from_checkpoint $ckpt_dir \ No newline at end of file diff --git a/scripts/exp-001/run_clm_no_tok.sh b/scripts/exp-001/run_clm_no_tok.sh new file mode 100644 index 0000000..af2be7f --- /dev/null +++ b/scripts/exp-001/run_clm_no_tok.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=5-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:8 + +# Default resources are 1 core with 2.8GB of memory. 
+
+# Use more memory (100GB) (CPU RAM):
+#SBATCH --mem=100g
+
+# Specify a job name:
+#SBATCH -J exp-001-run_clm_no_tok
+
+# Specify an output file
+#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_no_tok.out
+#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_no_tok.err
+
+# Set up the environment by loading modules
+set -a # automatically export all variables
+source ~/.env
+set +a
+
+module load python/3.7.4
+source $FP_BIGS/env_lang_mod/bin/activate
+
+tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer"
+cache_dir="${FP_BIGS}/data/external/oscar_fr"
+output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-no-tok"
+
+python $FP_BIGS/scripts/exp-001/run_clm.py \
+    --model_name_or_path gpt2 \
+    --dataset_name oscar \
+    --cache_dir $cache_dir \
+    --dataset_config_name unshuffled_deduplicated_fr \
+    --do_train \
+    --do_eval \
+    --output_dir $output_dir \
+    --preprocessing_num_workers 8 \
+    --overwrite_output_dir \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 4 \
+    --per_device_eval_batch_size 2 \
+    --eval_accumulation_steps 4 \
+    --eval_steps 500 \
+    --evaluation_strategy "steps" \
+    --max_eval_samples 5000
\ No newline at end of file
diff --git a/scripts/exp-001/train_tokenizer_gpt2.py b/scripts/exp-001/train_tokenizer_gpt2.py
new file mode 100644
index 0000000..d8fe237
--- /dev/null
+++ b/scripts/exp-001/train_tokenizer_gpt2.py
@@ -0,0 +1,20 @@
+from datasets import load_dataset
+from dotenv import load_dotenv
+import os
+from pathlib import Path
+
+load_dotenv(str(Path.home() / ".env"))
+
+dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_fr")
+
+def batch_iterator():
+    batch_size = 1000
+    for i in range(0, len(dataset['train']), batch_size):
+        yield dataset['train'][i : i + batch_size]["text"]
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+assert tokenizer.is_fast
+new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50_257)
+new_tokenizer.save_pretrained(f"{os.getenv('FP_BIGS')}/data/processed/exp-001/oscar-fr-tokenizer")
\ No newline at end of file
diff --git a/scripts/exp-002/eval_flue_cls.py b/scripts/exp-002/eval_flue_cls.py
new file mode 100644
index 0000000..a0fe4c0
--- /dev/null
+++ b/scripts/exp-002/eval_flue_cls.py
@@ -0,0 +1,111 @@
+import logging
+# setup logging
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    level=logging.INFO,
+)
+logging.getLogger().addHandler(logging.StreamHandler())
+
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("output_dir")
+parser.add_argument("--num_train_epochs", type=int, default=30)
+parser.add_argument("--learning_rate", type=float, default=1e-5)
+parser.add_argument("--per_device_train_batch_size", type=int, default=4)
+parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
+parser.add_argument("--pretrained_model")
+parser.add_argument("--tokenizer")
+parser.add_argument("--do_train", default=False, action="store_true")
+parser.add_argument("--do_predict", default=False, action="store_true")
+args = parser.parse_args()
+assert args.do_train ^ args.do_predict # current code doesn't allow do_train followed by do_predict
+
+
+from datasets import load_dataset
+
+cls_train_datasetdict = load_dataset("flue", "CLS",
+                                     split="train",
+                                     
cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue").train_test_split(train_size=0.8, shuffle=True, seed=42) +cls_train_dataset = cls_train_datasetdict['train'] +cls_val_dataset = cls_train_datasetdict['test'] +cls_test_dataset = load_dataset("flue", "CLS", + split="test", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") # "PAWS-X", "XNLI", "CLS", "WSD-V" + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) +full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) +full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) + + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/scripts/exp-002/eval_flue_cls.sh b/scripts/exp-002/eval_flue_cls.sh new file mode 100644 index 0000000..00adecd --- /dev/null +++ b/scripts/exp-002/eval_flue_cls.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+
+# Use more memory (100GB) (CPU RAM):
+#SBATCH --mem=100g
+
+# Specify a job name:
+#SBATCH -J exp-002-eval_flue_cls
+
+# Specify an output file
+#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls.out
+#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls.err
+
+# Set up the environment by loading modules
+set -a # automatically export all variables
+source ~/.env
+set +a
+
+module load python/3.7.4
+source $FP_BIGS/env_lang_mod/bin/activate
+
+learning_rates=( 1e-5 5e-5 1e-6 5e-6 )
+# learning_rates=( 1e-5 )
+for lr in ${learning_rates[@]} ; do
+    echo "LR ===== $lr"
+    OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-base/$lr"
+    # MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500"
+    MODEL_NAME="gpt2"
+    TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer"
+    mkdir -p $OUTPUT_DIR
+
+    python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \
+    --num_train_epochs 30 \
+    --learning_rate $lr \
+    --per_device_train_batch_size 4 \
+    --gradient_accumulation_steps 4 \
+    --pretrained_model $MODEL_NAME \
+    --tokenizer $TOKENIZER_NAME \
+    --do_train
+done
diff --git a/scripts/exp-002/eval_flue_cls_books.py b/scripts/exp-002/eval_flue_cls_books.py
new file mode 100644
index 0000000..141f579
--- /dev/null
+++ b/scripts/exp-002/eval_flue_cls_books.py
@@ -0,0 +1,125 @@
+import logging
+# setup logging
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    level=logging.INFO,
+)
+logging.getLogger().addHandler(logging.StreamHandler())
+
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("output_dir")
+parser.add_argument("--num_train_epochs", type=int, default=30)
+parser.add_argument("--learning_rate", type=float, default=1e-5)
+parser.add_argument("--per_device_train_batch_size", type=int, default=4)
+parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
+parser.add_argument("--pretrained_model")
+parser.add_argument("--tokenizer")
+parser.add_argument("--do_train", default=False, action="store_true")
+parser.add_argument("--do_predict", default=False, action="store_true")
+args = parser.parse_args()
+assert args.do_train ^ args.do_predict # current code doesn't allow do_train followed by do_predict
+
+
+from datasets import load_dataset
+
+cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue",
+                                     "CLS",
+                                     split="train",
+                                     cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42)
+cls_train_dataset = cls_train_datasetdict['train']
+cls_val_dataset = cls_train_datasetdict['test']
+cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue",
+                                "CLS",
+                                split="test",
+                                cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V"
+
+print("Before splitting:")
+print(cls_train_dataset)
+print(cls_val_dataset)
+print(cls_test_dataset)
+
+# split: books
+cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="books")
+cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="books")
+cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="books")
+
+print("After splitting:")
+print(cls_train_dataset)
+print(cls_val_dataset)
+print(cls_test_dataset)
+
+import torch
+import numpy as np
+from transformers import TrainingArguments, Trainer
+from transformers import GPT2Tokenizer, 
GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) +full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) +full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) + + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-002/eval_flue_cls_books.sh b/scripts/exp-002/eval_flue_cls_books.sh new file mode 100644 index 0000000..15a064a --- /dev/null +++ b/scripts/exp-002/eval_flue_cls_books.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_cls_books + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_books.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_books.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change books + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-books-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + # change books + python $FP_BIGS/scripts/exp-002/eval_flue_cls_books.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/scripts/exp-002/eval_flue_cls_dvd.py b/scripts/exp-002/eval_flue_cls_dvd.py new file mode 100644 index 0000000..0b8f675 --- /dev/null +++ b/scripts/exp-002/eval_flue_cls_dvd.py @@ -0,0 +1,125 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset + +cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split=f"train", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) +cls_train_dataset = cls_train_datasetdict['train'] +cls_val_dataset = cls_train_datasetdict['test'] +cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split="test", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" + +print("Before splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +# split: dvd +cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="dvd") +cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="dvd") +cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="dvd") + +print("After splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import 
GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) +full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) +full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) + + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-002/eval_flue_cls_dvd.sh b/scripts/exp-002/eval_flue_cls_dvd.sh new file mode 100644 index 0000000..c6c3f39 --- /dev/null +++ b/scripts/exp-002/eval_flue_cls_dvd.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_cls_dvd + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_dvd.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_dvd.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change dvd + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-dvd-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + # change dvd + python $FP_BIGS/scripts/exp-002/eval_flue_cls_dvd.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/scripts/exp-002/eval_flue_cls_music.py b/scripts/exp-002/eval_flue_cls_music.py new file mode 100644 index 0000000..721843c --- /dev/null +++ b/scripts/exp-002/eval_flue_cls_music.py @@ -0,0 +1,127 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset + +cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split=f"train", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) +cls_train_dataset = cls_train_datasetdict['train'] +cls_val_dataset = cls_train_datasetdict['test'] +cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", + "CLS", + split="test", + cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" + +print("Before splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +# split: music +cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="music") +cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="music") +cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="music") + +print("After splitting:") +print(cls_train_dataset) +print(cls_val_dataset) +print(cls_test_dataset) + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import 
GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) +full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) +full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) + + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/scripts/exp-002/eval_flue_cls_music.sh b/scripts/exp-002/eval_flue_cls_music.sh new file mode 100644 index 0000000..fd29ce9 --- /dev/null +++ b/scripts/exp-002/eval_flue_cls_music.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_cls_music + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_music.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_music.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change music + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-music-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + # change music + python $FP_BIGS/scripts/exp-002/eval_flue_cls_music.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/scripts/exp-002/eval_flue_paws.py b/scripts/exp-002/eval_flue_paws.py new file mode 100644 index 0000000..c33649a --- /dev/null +++ b/scripts/exp-002/eval_flue_paws.py @@ -0,0 +1,103 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset + +paws_dataset = load_dataset("flue", "PAWS-X", cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") +paws_train_dataset = paws_dataset['train'] +paws_val_dataset = paws_dataset['validation'] +paws_test_dataset = paws_dataset['test'] + +import torch +import numpy as np +from transformers import Trainer, TrainingArguments +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = 
full_test_dataset.shuffle(seed=42).select(range(100)) + +from datasets import load_metric + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +if args.do_train: + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/scripts/exp-002/eval_flue_paws.sh b/scripts/exp-002/eval_flue_paws.sh new file mode 100644 index 0000000..8644967 --- /dev/null +++ b/scripts/exp-002/eval_flue_paws.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_paws + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_paws.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_paws.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 5e-6 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/scripts/exp-002/eval_flue_xnli.py b/scripts/exp-002/eval_flue_xnli.py new file mode 100644 index 0000000..b2da78b --- /dev/null +++ b/scripts/exp-002/eval_flue_xnli.py @@ -0,0 +1,132 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric + + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) 
+logging.getLogger().addHandler(logging.StreamHandler()) + + +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +# load dataset +xnli_dataset = load_dataset("flue", "XNLI", cache_dir=args.cache_dir) +xnli_train_dataset = xnli_dataset['train'] +xnli_val_dataset = xnli_dataset['validation'] +xnli_test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypo"]}', padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = xnli_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = xnli_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = xnli_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +metric = load_metric("accuracy") +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +if args.do_train: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=0) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + + model = GPT2ForSequenceClassification.from_pretrained(evaluation_dirs[-1], + num_labels=3, + pad_token_id=0) + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_model = evaluation_dirs[-1] + 
logging.info(f"Loading trained model from {evaluation_dirs[-1]}") + + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=0) + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/scripts/exp-002/eval_flue_xnli.sh b/scripts/exp-002/eval_flue_xnli.sh new file mode 100644 index 0000000..619b856 --- /dev/null +++ b/scripts/exp-002/eval_flue_xnli.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=6-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_xnli + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-xnli-gpt2-tok/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_xnli.py $OUTPUT_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/scripts/exp-002/gpt2_eval_flue_cls.sh b/scripts/exp-002/gpt2_eval_flue_cls.sh new file mode 100644 index 0000000..6cf0746 --- /dev/null +++ b/scripts/exp-002/gpt2_eval_flue_cls.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_cls + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $MODEL_NAME +done diff --git a/scripts/exp-002/gpt2_eval_flue_cls_books.sh b/scripts/exp-002/gpt2_eval_flue_cls_books.sh new file mode 100644 index 0000000..135e5fb --- /dev/null +++ b/scripts/exp-002/gpt2_eval_flue_cls_books.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_cls_books + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_books.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_books.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change books + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-books-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + # change books + python $FP_BIGS/scripts/exp-002/eval_flue_cls_books.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer "gpt2" +done diff --git a/scripts/exp-002/gpt2_eval_flue_cls_dvd.sh b/scripts/exp-002/gpt2_eval_flue_cls_dvd.sh new file mode 100644 index 0000000..2507391 --- /dev/null +++ b/scripts/exp-002/gpt2_eval_flue_cls_dvd.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_cls_dvd + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_dvd.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_dvd.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change dvd + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-dvd-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + # change dvd + python $FP_BIGS/scripts/exp-002/eval_flue_cls_dvd.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer "gpt2" + +done diff --git a/scripts/exp-002/gpt2_eval_flue_cls_music.sh b/scripts/exp-002/gpt2_eval_flue_cls_music.sh new file mode 100644 index 0000000..8b4fc4b --- /dev/null +++ b/scripts/exp-002/gpt2_eval_flue_cls_music.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_cls_music + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_music.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_music.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # change music + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-music-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + # change music + python $FP_BIGS/scripts/exp-002/eval_flue_cls_music.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer "gpt2" +done diff --git a/scripts/exp-002/gpt2_eval_flue_paws.sh b/scripts/exp-002/gpt2_eval_flue_paws.sh new file mode 100644 index 0000000..7ebb253 --- /dev/null +++ b/scripts/exp-002/gpt2_eval_flue_paws.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-002-gpt2_eval_flue_paws + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_paws.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_paws.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +# learning_rates=( 5e-6 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-base/$lr" + MODEL_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer "gpt2" +done diff --git a/scripts/exp-002/gpt2_eval_flue_xnli.sh b/scripts/exp-002/gpt2_eval_flue_xnli.sh new file mode 100644 index 0000000..928a3fb --- /dev/null +++ b/scripts/exp-002/gpt2_eval_flue_xnli.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=6-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-002-eval_flue_xnli + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-xnli-gpt2-tok/$lr" + MODEL_NAME="gpt2" + TOKENIZER_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-002/eval_flue_xnli.py $OUTPUT_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train +done diff --git a/scripts/exp-004/download_pawsx.py b/scripts/exp-004/download_pawsx.py new file mode 100644 index 0000000..2d9e976 --- /dev/null +++ b/scripts/exp-004/download_pawsx.py @@ -0,0 +1,9 @@ +from datasets import load_dataset +from dotenv import load_dotenv +import os +from pathlib import Path + +load_dotenv(str(Path.home() / ".env")) + +dataset = load_dataset("paws-x", 'fr', cache_dir=f"{os.getenv('FP_BIGS')}/data/external/paws-x") +print("Done") \ No newline at end of file diff --git a/scripts/exp-004/download_pawsx.sh b/scripts/exp-004/download_pawsx.sh new file mode 100644 index 0000000..aa9e806 --- /dev/null +++ b/scripts/exp-004/download_pawsx.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=3-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-004-download_pawsx + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/download_pawsx.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/download_pawsx.err + +# Set up the environment by loading modules +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +python3 $FP_BIGS/scripts/exp-004/download_pawsx.py \ No newline at end of file diff --git a/scripts/exp-004/eval_paws_en.py b/scripts/exp-004/eval_paws_en.py new file mode 100644 index 0000000..7522436 --- /dev/null +++ b/scripts/exp-004/eval_paws_en.py @@ -0,0 +1,106 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + +from datasets import load_dataset +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset + +paws_dataset = load_dataset("paws-x", 'en', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") +paws_train_dataset = paws_dataset['train'] +paws_val_dataset = paws_dataset['validation'] +paws_test_dataset = paws_dataset['test'] + +import torch +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +def tokenize_function(examples): + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) + +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from transformers import TrainingArguments + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +from transformers 
import Trainer +from datasets import load_metric +import numpy as np + +metric = load_metric("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +if args.do_train: + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/scripts/exp-004/eval_paws_en.sh b/scripts/exp-004/eval_paws_en.sh new file mode 100644 index 0000000..9496863 --- /dev/null +++ b/scripts/exp-004/eval_paws_en.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_en + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_en.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_en.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +# for lr in ${learning_rates[@]} ; do +# echo "LR ===== $lr" +# OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" +# MODEL_NAME="gpt2" +# TOKENIZER_NAME="gpt2" +# mkdir -p $OUTPUT_DIR + +# python $FP_BIGS/scripts/exp-004/eval_paws_en.py $OUTPUT_DIR \ +# --num_train_epochs 30 \ +# --learning_rate $lr \ +# --per_device_train_batch_size 4 \ +# --gradient_accumulation_steps 4 \ +# --pretrained_model $MODEL_NAME \ +# --tokenizer $TOKENIZER_NAME \ +# --do_train +# done + +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" + TOKENIZER_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_en.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_predict +done \ No newline at end of file diff --git a/scripts/exp-004/eval_paws_fr.py b/scripts/exp-004/eval_paws_fr.py new file mode 100644 index 0000000..180c03e --- /dev/null +++ b/scripts/exp-004/eval_paws_fr.py @@ -0,0 +1,108 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) 
+parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +logging.info("Load Tokenizer") +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +logging.info("Load Raw Dataset") +paws_dataset = load_dataset("paws-x", 'fr', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") +paws_train_dataset = paws_dataset['train'] +paws_val_dataset = paws_dataset['validation'] +paws_test_dataset = paws_dataset['test'] + +def tokenize_function(examples): + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) + +logging.info("Load Dataset Ready for Training") +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logging.info("Load Metric") +from datasets import load_metric +metric = load_metric("accuracy") +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/scripts/exp-004/eval_paws_fr_ft.sh b/scripts/exp-004/eval_paws_fr_ft.sh new file mode 100644 index 0000000..8f14a14 --- /dev/null +++ b/scripts/exp-004/eval_paws_fr_ft.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 
GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_fr_ft + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_ft.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_ft.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-base/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr/checkpoint-92610" + TOKENIZER_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_fr.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/scripts/exp-004/eval_paws_fr_no_ft.sh b/scripts/exp-004/eval_paws_fr_no_ft.sh new file mode 100644 index 0000000..5a57c6b --- /dev/null +++ b/scripts/exp-004/eval_paws_fr_no_ft.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_fr_no_ft + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_no_ft.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_no_ft.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-base/$lr" + MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr/checkpoint-92610" + TOKENIZER_NAME="gpt2" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_fr.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_predict +done diff --git a/scripts/exp-004/eval_paws_fr_swapped_embedding.py b/scripts/exp-004/eval_paws_fr_swapped_embedding.py new file mode 100644 index 0000000..3199a41 --- /dev/null +++ b/scripts/exp-004/eval_paws_fr_swapped_embedding.py @@ -0,0 +1,117 @@ +import logging +# setup logging +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.INFO, +) +logging.getLogger().addHandler(logging.StreamHandler()) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) 
+parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--fr_gpt2_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +args = parser.parse_args() +assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict + + +from datasets import load_dataset +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +logging.info("Load Tokenizer") +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) + +logging.info("Load Raw Dataset") +paws_dataset = load_dataset("paws-x", 'fr', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") +paws_train_dataset = paws_dataset['train'] +paws_val_dataset = paws_dataset['validation'] +paws_test_dataset = paws_dataset['test'] + +def tokenize_function(examples): + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) + +logging.info("Load Dataset Ready for Training") +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) +full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) +full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logging.info("Load Metric") +from datasets import load_metric +metric = load_metric("accuracy") +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=2, + pad_token_id=0) + +fr_model = GPT2ForSequenceClassification.from_pretrained(args.fr_gpt2_model, + num_labels=2, + pad_token_id=0) + +# swapped the embedding layers +model.transformer.wte.weight = fr_model.transformer.wte.weight +model.transformer.wpe.weight = fr_model.transformer.wpe.weight + +if args.do_train: + logging.info("Start Training") + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=full_train_dataset, + eval_dataset=full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + logging.info("Start Evaluation") + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate:", trainer.evaluate()) + + diff --git a/scripts/exp-004/eval_paws_fr_swapped_embedding_ft.sh 
b/scripts/exp-004/eval_paws_fr_swapped_embedding_ft.sh new file mode 100644 index 0000000..b177cc9 --- /dev/null +++ b/scripts/exp-004/eval_paws_fr_swapped_embedding_ft.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_fr_swapped_embedding_ft + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_ft.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_ft.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-rp-embedding/$lr" + EN_MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" + FR_MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-111500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_fr_swapped_embedding.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $EN_MODEL_NAME \ + --fr_gpt2_model $FR_MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train +done diff --git a/scripts/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh b/scripts/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh new file mode 100644 index 0000000..6af8422 --- /dev/null +++ b/scripts/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-004-eval_paws_fr_swapped_embedding_no_ft + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_no_ft.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_no_ft.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" + EN_MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" + FR_MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-111500" + TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-004/eval_paws_fr_swapped_embedding.py $OUTPUT_DIR \ + --num_train_epochs 30 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $EN_MODEL_NAME \ + --fr_gpt2_model $FR_MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_predict +done diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..a4e486a --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,133 @@ +absl-py==0.14.0 +anyio==3.3.1 +argcomplete==1.12.3 +argon2-cffi==21.1.0 +attrs==21.2.0 +Babel==2.9.1 +backcall==0.2.0 +bleach==4.1.0 +cachetools==4.2.2 +certifi==2021.5.30 +cffi==1.14.6 +charset-normalizer==2.0.4 +click==8.0.1 +configparser==5.0.2 +datasets==1.11.0 +debugpy==1.4.3 +decorator==5.0.9 +defusedxml==0.7.1 +dill==0.3.4 +docker-pycreds==0.4.0 +entrypoints==0.3 +filelock==3.0.12 +fsspec==2021.8.1 +gitdb==4.0.7 +GitPython==3.1.24 +google-auth==1.35.0 +google-auth-oauthlib==0.4.6 +grpcio==1.41.0 +huggingface-hub==0.0.16 +idna==3.2 +importlib-metadata==4.8.1 +ipykernel==6.4.1 +ipython==7.27.0 +ipython-genutils==0.2.0 +ipywidgets==7.6.4 +jedi==0.18.0 +Jinja2==3.0.1 +joblib==1.0.1 +json5==0.9.6 +jsonschema==3.2.0 +jupyter==1.0.0 +jupyter-client==7.0.2 +jupyter-console==6.4.0 +jupyter-core==4.7.1 +jupyter-server==1.11.0 +jupyterlab==3.1.11 +jupyterlab-pygments==0.1.2 +jupyterlab-server==2.8.1 +jupyterlab-widgets==1.0.1 +lxml==4.6.3 +Markdown==3.3.4 +MarkupSafe==2.0.1 +matplotlib-inline==0.1.3 +mistune==0.8.4 +multiprocess==0.70.12.2 +nbclassic==0.3.1 +nbclient==0.5.4 +nbconvert==6.1.0 +nbformat==5.1.3 +nest-asyncio==1.5.1 +notebook==6.4.3 +numpy==1.21.2 +oauthlib==3.1.1 +packaging==21.0 +pandas==1.3.2 +pandocfilters==1.4.3 +parso==0.8.2 +pathtools==0.1.2 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==8.3.2 +prometheus-client==0.11.0 +promise==2.3 +prompt-toolkit==3.0.20 +protobuf==3.18.0 +psutil==5.8.0 +ptyprocess==0.7.0 +pyarrow==5.0.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.20 +Pygments==2.10.0 +pyparsing==2.4.7 +pyrsistent==0.18.0 +python-dateutil==2.8.2 +python-dotenv==0.19.0 +pytz==2021.1 +PyYAML==5.4.1 +pyzmq==22.2.1 +qtconsole==5.1.1 +QtPy==1.11.0 +regex==2021.8.28 +requests==2.26.0 +requests-oauthlib==1.3.0 +requests-unixsocket==0.2.0 +rsa==4.7.2 +sacremoses==0.0.45 +scikit-learn==0.24.2 +scipy==1.7.1 +Send2Trash==1.8.0 +sentry-sdk==1.4.2 +shortuuid==1.0.1 +six==1.16.0 +sklearn==0.0 +smmap==4.0.0 +sniffio==1.2.0 +subprocess32==3.5.4 +tensorboard==2.6.0 
+tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.0 +termcolor==1.1.0 +terminado==0.12.1 +testpath==0.5.0 +threadpoolctl==2.2.0 +tokenizers==0.10.3 +torch==1.9.0+cu111 +torchaudio==0.9.0 +torchvision==0.10.0+cu111 +tornado==6.1 +tqdm==4.62.2 +traitlets==5.1.0 +transformers @ git+https://github.com/huggingface/transformers@010965dcde8ce9526f6a7e6e2c3f36276c153708 +typing-extensions==3.10.0.2 +urllib3==1.26.6 +wandb==0.12.2 +wcwidth==0.2.5 +webencodings==0.5.1 +websocket-client==1.2.1 +Werkzeug==2.0.1 +widgetsnbextension==3.5.1 +xxhash==2.0.2 +yaspin==2.1.0 +zipp==3.5.0 From d5beb9c409d3cdc29a9751ee801325eb863c5ae8 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 27 Oct 2021 15:02:17 -0400 Subject: [PATCH 040/142] remove experiments folder --- experiments/README.md | 7 - experiments/exp-001/README.md | 22 - experiments/exp-001/download_oscar_fr.py | 9 - experiments/exp-001/download_oscar_fr.sh | 30 - experiments/exp-001/run_clm.py | 549 ------------------ experiments/exp-001/run_clm.sh | 57 -- experiments/exp-001/run_clm_no_tok.sh | 49 -- experiments/exp-001/train_tokenizer_gpt2.py | 20 - experiments/exp-002/eval_flue_cls.py | 111 ---- experiments/exp-002/eval_flue_cls.sh | 47 -- experiments/exp-002/eval_flue_cls_books.py | 125 ---- experiments/exp-002/eval_flue_cls_books.sh | 48 -- experiments/exp-002/eval_flue_cls_dvd.py | 125 ---- experiments/exp-002/eval_flue_cls_dvd.sh | 48 -- experiments/exp-002/eval_flue_cls_music.py | 127 ---- experiments/exp-002/eval_flue_cls_music.sh | 48 -- experiments/exp-002/eval_flue_paws.py | 103 ---- experiments/exp-002/eval_flue_paws.sh | 46 -- experiments/exp-002/eval_flue_xnli.py | 103 ---- experiments/exp-002/eval_flue_xnli.sh | 47 -- experiments/exp-002/gpt2_eval_flue_cls.sh | 44 -- .../exp-002/gpt2_eval_flue_cls_books.sh | 46 -- experiments/exp-002/gpt2_eval_flue_cls_dvd.sh | 47 -- .../exp-002/gpt2_eval_flue_cls_music.sh | 46 -- experiments/exp-002/gpt2_eval_flue_paws.sh | 44 -- experiments/exp-004/download_pawsx.py | 9 - experiments/exp-004/download_pawsx.sh | 30 - experiments/exp-004/eval_paws_en.py | 106 ---- experiments/exp-004/eval_paws_en.sh | 63 -- experiments/exp-004/eval_paws_fr.py | 108 ---- experiments/exp-004/eval_paws_fr_ft.sh | 46 -- experiments/exp-004/eval_paws_fr_no_ft.sh | 46 -- .../exp-004/eval_paws_fr_swapped_embedding.py | 117 ---- .../eval_paws_fr_swapped_embedding_ft.sh | 48 -- .../eval_paws_fr_swapped_embedding_no_ft.sh | 48 -- experiments/requirements.txt | 133 ----- 36 files changed, 2702 deletions(-) delete mode 100644 experiments/README.md delete mode 100644 experiments/exp-001/README.md delete mode 100644 experiments/exp-001/download_oscar_fr.py delete mode 100644 experiments/exp-001/download_oscar_fr.sh delete mode 100644 experiments/exp-001/run_clm.py delete mode 100644 experiments/exp-001/run_clm.sh delete mode 100644 experiments/exp-001/run_clm_no_tok.sh delete mode 100644 experiments/exp-001/train_tokenizer_gpt2.py delete mode 100644 experiments/exp-002/eval_flue_cls.py delete mode 100644 experiments/exp-002/eval_flue_cls.sh delete mode 100644 experiments/exp-002/eval_flue_cls_books.py delete mode 100644 experiments/exp-002/eval_flue_cls_books.sh delete mode 100644 experiments/exp-002/eval_flue_cls_dvd.py delete mode 100644 experiments/exp-002/eval_flue_cls_dvd.sh delete mode 100644 experiments/exp-002/eval_flue_cls_music.py delete mode 100644 experiments/exp-002/eval_flue_cls_music.sh delete mode 100644 experiments/exp-002/eval_flue_paws.py delete mode 100644 experiments/exp-002/eval_flue_paws.sh delete mode 
100644 experiments/exp-002/eval_flue_xnli.py delete mode 100644 experiments/exp-002/eval_flue_xnli.sh delete mode 100644 experiments/exp-002/gpt2_eval_flue_cls.sh delete mode 100644 experiments/exp-002/gpt2_eval_flue_cls_books.sh delete mode 100644 experiments/exp-002/gpt2_eval_flue_cls_dvd.sh delete mode 100644 experiments/exp-002/gpt2_eval_flue_cls_music.sh delete mode 100644 experiments/exp-002/gpt2_eval_flue_paws.sh delete mode 100644 experiments/exp-004/download_pawsx.py delete mode 100644 experiments/exp-004/download_pawsx.sh delete mode 100644 experiments/exp-004/eval_paws_en.py delete mode 100644 experiments/exp-004/eval_paws_en.sh delete mode 100644 experiments/exp-004/eval_paws_fr.py delete mode 100644 experiments/exp-004/eval_paws_fr_ft.sh delete mode 100644 experiments/exp-004/eval_paws_fr_no_ft.sh delete mode 100644 experiments/exp-004/eval_paws_fr_swapped_embedding.py delete mode 100644 experiments/exp-004/eval_paws_fr_swapped_embedding_ft.sh delete mode 100644 experiments/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh delete mode 100644 experiments/requirements.txt diff --git a/experiments/README.md b/experiments/README.md deleted file mode 100644 index 66c5a09..0000000 --- a/experiments/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Current Experiments -- `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_fr`. -- `exp-002`: evaluate gpt-2-{finetuned on OSCAR-FR, base} on FLUE's tasks (CLS, XNLI, PAWS) -- `exp-004`: evaluate gpt-2 base and swapped-embedding-layers for PAWS-X - -# Carbon Tracking -Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) diff --git a/experiments/exp-001/README.md b/experiments/exp-001/README.md deleted file mode 100644 index 75eb89d..0000000 --- a/experiments/exp-001/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# README - -- use `download_oscar_fr.sh` to download the datasets. To download datasets for other languages, make the necessary changes on line 8 in the `download_oscar_fr.py`. -- run `train_tokenizer_gpt2.py` to train the tokenizer for the new dataset. Make necessary changes on line 8 to load the dataset and line 20 to save the trained tokenizer. -- run `run_clm.sh` to train GPT-2. Important changes to arguments that might be made: - - `tokenizer_dir`: directory of saved tokenizer. - - `cache_dir`: directory of cached dataset from `download_oscar_fr.sh` (remember to make changes to the dataset use in the argument `dataset_name` and `dataset_config_name`). - - `output_dir`: directory where the gpt2 is checkpointed during training. - - `ckpt_dir`: used for continuing training from checkpoint. - ---- - -# Decisions - -**Dataset**: HF's OSCAR unshuffled_deduplicated_fr - -**Tokenizer**: byte-level Byte-pair encoding tokenizer (same as GPT-2). Training is identical to the section "Using an existing tokenizer" in huggingface's tokenizer_training [tutorial](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) -tokenizer_name: `/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer` -- train the GPT-2 tokenizer with the exact same algorithms and parameters as an existing one. 
-- vocab_size: 50,257 (same as original GPT-2) - - diff --git a/experiments/exp-001/download_oscar_fr.py b/experiments/exp-001/download_oscar_fr.py deleted file mode 100644 index fd04779..0000000 --- a/experiments/exp-001/download_oscar_fr.py +++ /dev/null @@ -1,9 +0,0 @@ -from datasets import load_dataset -from dotenv import load_dotenv -import os -from pathlib import Path - -load_dotenv(str(Path.home() / ".env")) - -dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_fr") -print("Done") \ No newline at end of file diff --git a/experiments/exp-001/download_oscar_fr.sh b/experiments/exp-001/download_oscar_fr.sh deleted file mode 100644 index a558c9e..0000000 --- a/experiments/exp-001/download_oscar_fr.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=3-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-031-download_oscar_fr - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_fr.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_fr.err - -# Set up the environment by loading modules -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -python3 $FP_BIGS/scripts/exp-001/download_oscar_fr.py \ No newline at end of file diff --git a/experiments/exp-001/run_clm.py b/experiments/exp-001/run_clm.py deleted file mode 100644 index 4c1407f..0000000 --- a/experiments/exp-001/run_clm.py +++ /dev/null @@ -1,549 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. - -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=causal-lm -""" -# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
- -import torch -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from typing import Optional -import pathlib - -import datasets -from datasets import load_dataset - -import transformers -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - Trainer, - TrainingArguments, - default_data_collator, - set_seed, -) -from transformers.testing_utils import CaptureLogger -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.11.0.dev0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - -logger = logging.getLogger(__name__) - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - - block_size: Optional[int] = field( - default=None, - metadata={ - "help": "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
- raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
- ) - - if model_args.model_name_or_path: - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - model = AutoModelForCausalLM.from_config(config) - n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - - model.resize_token_embeddings(len(tokenizer)) - for name, param in model.named_parameters(): - if name not in ('transformer.wte.weight', 'transformer.wpe.weight'): - print(f"🥶 Freeze layer '{name}'") - param.requires_grad = False - else: - param.requires_grad = True - - # Preprocessing the datasets. - # First we tokenize all the texts. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - else: - column_names = raw_datasets["validation"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - - def tokenize_function(examples): - with CaptureLogger(tok_logger) as cl: - output = tokenizer(examples[text_column_name]) - # clm input could be much much longer than block_size - if "Token indices sequence length is longer than the" in cl.out: - tok_logger.warning( - "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." - ) - return output - - with training_args.main_process_first(desc="dataset map tokenization"): - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.output_dir}/tokenized_datasets.pt") - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): - tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) - print("Sanity check: loaded tokenized_datasets") - else: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - torch.save(tokenized_datasets, saved_tokenized_datasets_fp) - print("Sanity check: saved tokenized_datasets") - - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." - ) - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." - ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. 
- concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - with training_args.main_process_first(desc="grouping texts together"): - saved_lm_datasets_fp = pathlib.Path(f"{training_args.output_dir}/lm_datasets.pt") - if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): - lm_datasets = torch.load(str(saved_lm_datasets_fp)) - print("Sanity check: loaded lm_datasets") - else: - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - torch.save(lm_datasets, saved_lm_datasets_fp) - print("Sanity check: saved lm_datasets") - - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = lm_datasets["train"] - if data_args.max_train_samples is not None: - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = lm_datasets["validation"] - if data_args.max_eval_samples is not None: - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - - import gc - del tokenized_datasets - gc.collect() - - # Initialize our Trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. 
- data_collator=default_data_collator, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - print("Checkpoint:", checkpoint) - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - if training_args.push_to_hub: - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - trainer.push_to_hub(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() diff --git a/experiments/exp-001/run_clm.sh b/experiments/exp-001/run_clm.sh deleted file mode 100644 index a88ff21..0000000 --- a/experiments/exp-001/run_clm.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=5-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:8 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=16 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-001-run_clm - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer" -cache_dir="${FP_BIGS}/data/external/oscar_fr" -output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-2" -logging_dir="${FP_BIGS}/reports/exp-001/ft-gpt2-2" -ckpt_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-2/checkpoint-25000" - -python $FP_BIGS/scripts/exp-001/run_clm.py \ - --model_name_or_path gpt2 \ - --tokenizer_name $tokenizer_dir \ - --dataset_name oscar \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_fr \ - --logging_dir $logging_dir \ - --report_to "tensorboard" \ - --learning_rate 0.001 \ - --do_train \ - --do_eval \ - --output_dir $output_dir \ - --preprocessing_num_workers 8 \ - --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ - --eval_steps 1000 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ - --resume_from_checkpoint $ckpt_dir \ No newline at end of file diff --git a/experiments/exp-001/run_clm_no_tok.sh b/experiments/exp-001/run_clm_no_tok.sh deleted file mode 100644 index af2be7f..0000000 --- a/experiments/exp-001/run_clm_no_tok.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=5-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:8 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-001-run_clm_no_tok - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_no_tok.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_no_tok.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer" -cache_dir="${FP_BIGS}/data/external/oscar_fr" -output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-no-tok" - -python $FP_BIGS/scripts/exp-001/run_clm.py \ - --model_name_or_path gpt2 \ - --dataset_name oscar \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_fr \ - --do_train \ - --do_eval \ - --output_dir $output_dir \ - --preprocessing_num_workers 8 \ - --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ - --eval_steps 500 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ No newline at end of file diff --git a/experiments/exp-001/train_tokenizer_gpt2.py b/experiments/exp-001/train_tokenizer_gpt2.py deleted file mode 100644 index d8fe237..0000000 --- a/experiments/exp-001/train_tokenizer_gpt2.py +++ /dev/null @@ -1,20 +0,0 @@ -from datasets import load_dataset -from dotenv import load_dotenv -import os -from pathlib import Path - -load_dotenv(str(Path.home() / ".env")) - -dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_fr") - -def batch_iterator(): - batch_size = 1000 - for i in range(0, len(dataset), batch_size): - yield dataset['train'][i : i + batch_size]["text"] - -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("gpt2") -assert tokenizer.is_fast -new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50_257) -new_tokenizer.save_pretrained(f"{os.getenv('FP_BIGS')}/data/processed/exp-001/oscar-fr-tokenizer") \ No newline at end of file diff --git a/experiments/exp-002/eval_flue_cls.py b/experiments/exp-002/eval_flue_cls.py deleted file mode 100644 index a0fe4c0..0000000 --- a/experiments/exp-002/eval_flue_cls.py +++ /dev/null @@ -1,111 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -cls_train_datasetdict = load_dataset("flue", "CLS", - split=f"train", - 
cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue").train_test_split(train_size=0.8, shuffle=True, seed=42) -cls_train_dataset = cls_train_datasetdict['train'] -cls_val_dataset = cls_train_datasetdict['test'] -cls_test_dataset = load_dataset("flue", "CLS", - split="test", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") # "PAWS-X", "XNLI", "CLS", "WSD-V" - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) -full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) -full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) - - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/experiments/exp-002/eval_flue_cls.sh b/experiments/exp-002/eval_flue_cls.sh deleted file mode 100644 index 00adecd..0000000 --- a/experiments/exp-002/eval_flue_cls.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_cls - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-base/$lr" - # MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - MODEL_NAME="gpt-2" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/experiments/exp-002/eval_flue_cls_books.py b/experiments/exp-002/eval_flue_cls_books.py deleted file mode 100644 index 141f579..0000000 --- a/experiments/exp-002/eval_flue_cls_books.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split=f"train", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) -cls_train_dataset = cls_train_datasetdict['train'] -cls_val_dataset = cls_train_datasetdict['test'] -cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split="test", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" - -print("Before splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -# split: books -cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="books") -cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="books") -cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="books") - -print("After splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import 
GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) -full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) -full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) - - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) \ No newline at end of file diff --git a/experiments/exp-002/eval_flue_cls_books.sh b/experiments/exp-002/eval_flue_cls_books.sh deleted file mode 100644 index 15a064a..0000000 --- a/experiments/exp-002/eval_flue_cls_books.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
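eval_flue_cls_books.py and its dvd/music counterparts further down differ only in the hardcoded category value. A sketch of the shared loading-and-filtering pattern with the category as a parameter, assuming the same FLUE CLS configuration as in these scripts (the cache path is a placeholder):

from datasets import DatasetDict, load_dataset

def load_cls_category(category: str, cache_dir: str = "/tmp/flue-cls") -> DatasetDict:
    """Load FLUE CLS and keep a single review category (books, dvd or music)."""
    train_valid = load_dataset("flue", "CLS", split="train", cache_dir=cache_dir)
    train_valid = train_valid.train_test_split(train_size=0.8, shuffle=True, seed=42)
    test = load_dataset("flue", "CLS", split="test", cache_dir=cache_dir)

    keep = lambda example: example["category"] == category
    return DatasetDict(
        train=train_valid["train"].filter(keep),
        validation=train_valid["test"].filter(keep),
        test=test.filter(keep),
    )

books = load_cls_category("books")   # "dvd" and "music" reuse the same function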
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_cls_books - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_books.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_books.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change books - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-books-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - # change books - python $FP_BIGS/scripts/exp-002/eval_flue_cls_books.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/experiments/exp-002/eval_flue_cls_dvd.py b/experiments/exp-002/eval_flue_cls_dvd.py deleted file mode 100644 index 0b8f675..0000000 --- a/experiments/exp-002/eval_flue_cls_dvd.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split=f"train", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) -cls_train_dataset = cls_train_datasetdict['train'] -cls_val_dataset = cls_train_datasetdict['test'] -cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split="test", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" - -print("Before splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -# split: dvd -cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="dvd") -cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="dvd") -cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="dvd") - -print("After splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from 
transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) -full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) -full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) - - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) \ No newline at end of file diff --git a/experiments/exp-002/eval_flue_cls_dvd.sh b/experiments/exp-002/eval_flue_cls_dvd.sh deleted file mode 100644 index c6c3f39..0000000 --- a/experiments/exp-002/eval_flue_cls_dvd.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
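Each of these eval scripts guards its two modes with assert args.do_train ^ args.do_predict. The same constraint can be expressed directly in argparse with a required mutually exclusive group; a short sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
parser.add_argument("--learning_rate", type=float, default=1e-5)

# Exactly one of the two flags must be given; argparse prints a usage error otherwise.
mode = parser.add_mutually_exclusive_group(required=True)
mode.add_argument("--do_train", action="store_true")
mode.add_argument("--do_predict", action="store_true")

args = parser.parse_args()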
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_cls_dvd - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_dvd.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_dvd.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change dvd - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-dvd-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - # change dvd - python $FP_BIGS/scripts/exp-002/eval_flue_cls_dvd.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/experiments/exp-002/eval_flue_cls_music.py b/experiments/exp-002/eval_flue_cls_music.py deleted file mode 100644 index 721843c..0000000 --- a/experiments/exp-002/eval_flue_cls_music.py +++ /dev/null @@ -1,127 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split=f"train", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) -cls_train_dataset = cls_train_datasetdict['train'] -cls_val_dataset = cls_train_datasetdict['test'] -cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split="test", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" - -print("Before splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -# split: music -cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="music") -cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="music") -cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="music") - -print("After splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from 
transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) -full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) -full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) - - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/experiments/exp-002/eval_flue_cls_music.sh b/experiments/exp-002/eval_flue_cls_music.sh deleted file mode 100644 index fd29ce9..0000000 --- a/experiments/exp-002/eval_flue_cls_music.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
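tokenize_function above pads with padding="max_length" and no explicit max_length, so every example is padded and truncated to GPT-2's full 1024-token context. If the inputs are known to be much shorter, an explicit cap keeps the batches lighter; a sketch with an illustrative limit, assuming the tokenizer and datasets defined in the script above:

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,  # illustrative cap; defaults to the model maximum (1024) if omitted
    )

full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True)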
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_cls_music - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_music.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_music.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change music - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-music-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - # change music - python $FP_BIGS/scripts/exp-002/eval_flue_cls_music.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/experiments/exp-002/eval_flue_paws.py b/experiments/exp-002/eval_flue_paws.py deleted file mode 100644 index c33649a..0000000 --- a/experiments/exp-002/eval_flue_paws.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -paws_dataset = load_dataset("flue", "PAWS-X", cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") -paws_train_dataset = paws_dataset['train'] -paws_val_dataset = paws_dataset['validation'] -paws_test_dataset = paws_dataset['test'] - -import torch -import numpy as np -from transformers import Trainer, TrainingArguments -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = 
full_test_dataset.shuffle(seed=42).select(range(100)) - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) - -if args.do_train: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/experiments/exp-002/eval_flue_paws.sh b/experiments/exp-002/eval_flue_paws.sh deleted file mode 100644 index 8644967..0000000 --- a/experiments/exp-002/eval_flue_paws.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
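eval_flue_paws.py maps its tokenizer with batched=False because the f-string handles one example at a time. A batched variant that joins each sentence pair with the EOS separator, assuming the tokenizer and PAWS-X datasets defined in that script:

def tokenize_pairs(examples):
    # With batched=True, examples is a dict of lists rather than a single row.
    texts = [
        f"{s1} {tokenizer.eos_token} {s2}"
        for s1, s2 in zip(examples["sentence1"], examples["sentence2"])
    ]
    return tokenizer(texts, padding="max_length", truncation=True)

full_train_dataset = paws_train_dataset.map(tokenize_pairs, batched=True)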
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_paws - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_paws.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_paws.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 5e-6 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/experiments/exp-002/eval_flue_xnli.py b/experiments/exp-002/eval_flue_xnli.py deleted file mode 100644 index 8d48218..0000000 --- a/experiments/exp-002/eval_flue_xnli.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -xnli_dataset = load_dataset("flue", "XNLI", cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") -xnli_train_dataset = xnli_dataset['train'] -xnli_val_dataset = xnli_dataset['validation'] -xnli_test_dataset = xnli_dataset['test'] - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypo"]}', padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = xnli_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = xnli_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = xnli_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -from datasets import 
load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=0) - -if args.do_train: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/experiments/exp-002/eval_flue_xnli.sh b/experiments/exp-002/eval_flue_xnli.sh deleted file mode 100644 index 619b856..0000000 --- a/experiments/exp-002/eval_flue_xnli.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=6-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_xnli - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-xnli-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_xnli.py $OUTPUT_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/experiments/exp-002/gpt2_eval_flue_cls.sh b/experiments/exp-002/gpt2_eval_flue_cls.sh deleted file mode 100644 index 6cf0746..0000000 --- a/experiments/exp-002/gpt2_eval_flue_cls.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
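Every TrainingArguments block in these scripts sets load_best_model_at_end=True without naming a metric, so the best checkpoint is selected by evaluation loss rather than the accuracy computed in compute_metrics. A sketch with the selection metric made explicit (values are illustrative, not the patch's):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="out",                    # placeholder
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-6,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",    # key returned by compute_metrics
    greater_is_better=True,
)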
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_cls - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $MODEL_NAME -done diff --git a/experiments/exp-002/gpt2_eval_flue_cls_books.sh b/experiments/exp-002/gpt2_eval_flue_cls_books.sh deleted file mode 100644 index 135e5fb..0000000 --- a/experiments/exp-002/gpt2_eval_flue_cls_books.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_cls_books - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_books.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_books.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change books - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-books-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - # change books - python $FP_BIGS/scripts/exp-002/eval_flue_cls_books.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer "gpt2" -done diff --git a/experiments/exp-002/gpt2_eval_flue_cls_dvd.sh b/experiments/exp-002/gpt2_eval_flue_cls_dvd.sh deleted file mode 100644 index 2507391..0000000 --- a/experiments/exp-002/gpt2_eval_flue_cls_dvd.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
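The learning-rate sweeps here are bash for-loops that create one output directory per rate and launch the eval script. An equivalent Python driver, sketched with placeholder paths and the same hyperparameters as the loops above:

import subprocess
from pathlib import Path

learning_rates = ["1e-5", "5e-5", "1e-6", "5e-6"]
script = "scripts/exp-002/eval_flue_cls.py"          # placeholder path

for lr in learning_rates:
    out_dir = Path("out/flue-cls-gpt2-base") / lr
    out_dir.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            "python", script, str(out_dir),
            "--num_train_epochs", "30",
            "--learning_rate", lr,
            "--per_device_train_batch_size", "4",
            "--gradient_accumulation_steps", "4",
            "--pretrained_model", "gpt2",
            "--tokenizer", "gpt2",
            "--do_train",
        ],
        check=True,  # stop the sweep if one run fails
    )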
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_cls_dvd - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_dvd.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_dvd.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change dvd - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-dvd-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - # change dvd - python $FP_BIGS/scripts/exp-002/eval_flue_cls_dvd.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer "gpt2" - -done diff --git a/experiments/exp-002/gpt2_eval_flue_cls_music.sh b/experiments/exp-002/gpt2_eval_flue_cls_music.sh deleted file mode 100644 index 8b4fc4b..0000000 --- a/experiments/exp-002/gpt2_eval_flue_cls_music.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_cls_music - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_music.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_music.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change music - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-music-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - # change music - python $FP_BIGS/scripts/exp-002/eval_flue_cls_music.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer "gpt2" -done diff --git a/experiments/exp-002/gpt2_eval_flue_paws.sh b/experiments/exp-002/gpt2_eval_flue_paws.sh deleted file mode 100644 index 7ebb253..0000000 --- a/experiments/exp-002/gpt2_eval_flue_paws.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_paws - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_paws.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_paws.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 5e-6 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer "gpt2" -done diff --git a/experiments/exp-004/download_pawsx.py b/experiments/exp-004/download_pawsx.py deleted file mode 100644 index 2d9e976..0000000 --- a/experiments/exp-004/download_pawsx.py +++ /dev/null @@ -1,9 +0,0 @@ -from datasets import load_dataset -from dotenv import load_dotenv -import os -from pathlib import Path - -load_dotenv(str(Path.home() / ".env")) - -dataset = load_dataset("paws-x", 'fr', cache_dir=f"{os.getenv('FP_BIGS')}/data/external/paws-x") -print("Done") \ No newline at end of file diff --git a/experiments/exp-004/download_pawsx.sh b/experiments/exp-004/download_pawsx.sh deleted file mode 100644 index aa9e806..0000000 --- a/experiments/exp-004/download_pawsx.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=3-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
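download_pawsx.py above appears to exist only to warm the datasets cache before the training and evaluation jobs run, so they can reuse the same files. A minimal sketch of that pre-caching step with a placeholder cache path:

import os
from datasets import load_dataset

cache_dir = os.path.expanduser("~/data/external/paws-x")  # placeholder

# The first call downloads and caches; later jobs that pass the same cache_dir reuse it.
dataset = load_dataset("paws-x", "fr", cache_dir=cache_dir)
print({split: len(ds) for split, ds in dataset.items()})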
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-004-download_pawsx - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/download_pawsx.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/download_pawsx.err - -# Set up the environment by loading modules -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -python3 $FP_BIGS/scripts/exp-004/download_pawsx.py \ No newline at end of file diff --git a/experiments/exp-004/eval_paws_en.py b/experiments/exp-004/eval_paws_en.py deleted file mode 100644 index 7522436..0000000 --- a/experiments/exp-004/eval_paws_en.py +++ /dev/null @@ -1,106 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - -from datasets import load_dataset -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -paws_dataset = load_dataset("paws-x", 'en', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") -paws_train_dataset = paws_dataset['train'] -paws_val_dataset = paws_dataset['validation'] -paws_test_dataset = paws_dataset['test'] - -import torch -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -from transformers import TrainingArguments - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -from 
transformers import Trainer -from datasets import load_metric -import numpy as np - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) - -if args.do_train: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/experiments/exp-004/eval_paws_en.sh b/experiments/exp-004/eval_paws_en.sh deleted file mode 100644 index 9496863..0000000 --- a/experiments/exp-004/eval_paws_en.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_en - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_en.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_en.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -# for lr in ${learning_rates[@]} ; do -# echo "LR ===== $lr" -# OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" -# MODEL_NAME="gpt2" -# TOKENIZER_NAME="gpt2" -# mkdir -p $OUTPUT_DIR - -# python $FP_BIGS/scripts/exp-004/eval_paws_en.py $OUTPUT_DIR \ -# --num_train_epochs 30 \ -# --learning_rate $lr \ -# --per_device_train_batch_size 4 \ -# --gradient_accumulation_steps 4 \ -# --pretrained_model $MODEL_NAME \ -# --tokenizer $TOKENIZER_NAME \ -# --do_train -# done - -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" - TOKENIZER_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_en.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_predict -done \ No newline at end of file diff --git a/experiments/exp-004/eval_paws_fr.py b/experiments/exp-004/eval_paws_fr.py deleted file mode 100644 index 180c03e..0000000 --- a/experiments/exp-004/eval_paws_fr.py +++ /dev/null @@ -1,108 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) 
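In eval_paws_en.sh the training sweep is commented out and the job reloads checkpoint-92610 with --do_predict, i.e. evaluation only. A condensed sketch of that path, assuming the tokenized test set and compute_metrics from eval_paws_en.py (the checkpoint path is abbreviated):

from transformers import GPT2ForSequenceClassification, Trainer, TrainingArguments

ckpt = "path/to/paws-en-gpt2-base/1e-5/checkpoint-92610"   # as selected in the script above
model = GPT2ForSequenceClassification.from_pretrained(ckpt, num_labels=2, pad_token_id=0)

eval_args = TrainingArguments(output_dir="eval-out", per_device_eval_batch_size=4)
trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=full_test_dataset,   # assumed: tokenized PAWS-X test split
    compute_metrics=compute_metrics,  # assumed: accuracy, as defined above
)
print("Evaluate:", trainer.evaluate())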
-parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -logging.info("Load Tokenizer") -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -logging.info("Load Raw Dataset") -paws_dataset = load_dataset("paws-x", 'fr', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") -paws_train_dataset = paws_dataset['train'] -paws_val_dataset = paws_dataset['validation'] -paws_test_dataset = paws_dataset['test'] - -def tokenize_function(examples): - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) - -logging.info("Load Dataset Ready for Training") -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logging.info("Load Metric") -from datasets import load_metric -metric = load_metric("accuracy") -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) - -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/experiments/exp-004/eval_paws_fr_ft.sh b/experiments/exp-004/eval_paws_fr_ft.sh deleted file mode 100644 index 8f14a14..0000000 --- a/experiments/exp-004/eval_paws_fr_ft.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU 
partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_fr_ft - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_ft.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_ft.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-base/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr/checkpoint-92610" - TOKENIZER_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_fr.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/experiments/exp-004/eval_paws_fr_no_ft.sh b/experiments/exp-004/eval_paws_fr_no_ft.sh deleted file mode 100644 index 5a57c6b..0000000 --- a/experiments/exp-004/eval_paws_fr_no_ft.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_fr_no_ft - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_no_ft.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_no_ft.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-base/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr/checkpoint-92610" - TOKENIZER_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_fr.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_predict -done diff --git a/experiments/exp-004/eval_paws_fr_swapped_embedding.py b/experiments/exp-004/eval_paws_fr_swapped_embedding.py deleted file mode 100644 index 3199a41..0000000 --- a/experiments/exp-004/eval_paws_fr_swapped_embedding.py +++ /dev/null @@ -1,117 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, 
default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--fr_gpt2_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -logging.info("Load Tokenizer") -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -logging.info("Load Raw Dataset") -paws_dataset = load_dataset("paws-x", 'fr', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") -paws_train_dataset = paws_dataset['train'] -paws_val_dataset = paws_dataset['validation'] -paws_test_dataset = paws_dataset['test'] - -def tokenize_function(examples): - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) - -logging.info("Load Dataset Ready for Training") -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logging.info("Load Metric") -from datasets import load_metric -metric = load_metric("accuracy") -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) - -fr_model = GPT2ForSequenceClassification.from_pretrained(args.fr_gpt2_model, - num_labels=2, - pad_token_id=0) - -# swapped the embedding layers -model.transformer.wte.weight = fr_model.transformer.wte.weight -model.transformer.wpe.weight = fr_model.transformer.wpe.weight - -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git 
a/experiments/exp-004/eval_paws_fr_swapped_embedding_ft.sh b/experiments/exp-004/eval_paws_fr_swapped_embedding_ft.sh deleted file mode 100644 index b177cc9..0000000 --- a/experiments/exp-004/eval_paws_fr_swapped_embedding_ft.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_fr_swapped_embedding_ft - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_ft.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_ft.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-rp-embedding/$lr" - EN_MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" - FR_MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-111500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_fr_swapped_embedding.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $EN_MODEL_NAME \ - --fr_gpt2_model $FR_MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/experiments/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh b/experiments/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh deleted file mode 100644 index 6af8422..0000000 --- a/experiments/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
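For reference, the embedding-swap step at the heart of eval_paws_fr_swapped_embedding.py (removed above) boils down to two weight reassignments. A minimal sketch with placeholder checkpoint paths (the real runs point at the PAWS-en and ft-gpt2 checkpoints set in the launcher scripts):

from transformers import GPT2ForSequenceClassification

# Placeholder checkpoints: an English PAWS-X classifier and a French-adapted GPT-2.
en_model = GPT2ForSequenceClassification.from_pretrained(
    "path/to/paws-en-gpt2-checkpoint", num_labels=2, pad_token_id=0)
fr_model = GPT2ForSequenceClassification.from_pretrained(
    "path/to/ft-gpt2-fr-checkpoint", num_labels=2, pad_token_id=0)

# Keep the English transformer body and classification head, but swap in the
# French token (wte) and position (wpe) embeddings before evaluating on PAWS-X fr.
en_model.transformer.wte.weight = fr_model.transformer.wte.weight
en_model.transformer.wpe.weight = fr_model.transformer.wpe.weight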
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_fr_swapped_embedding_no_ft - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_no_ft.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_no_ft.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" - EN_MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" - FR_MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-111500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_fr_swapped_embedding.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $EN_MODEL_NAME \ - --fr_gpt2_model $FR_MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_predict -done diff --git a/experiments/requirements.txt b/experiments/requirements.txt deleted file mode 100644 index a4e486a..0000000 --- a/experiments/requirements.txt +++ /dev/null @@ -1,133 +0,0 @@ -absl-py==0.14.0 -anyio==3.3.1 -argcomplete==1.12.3 -argon2-cffi==21.1.0 -attrs==21.2.0 -Babel==2.9.1 -backcall==0.2.0 -bleach==4.1.0 -cachetools==4.2.2 -certifi==2021.5.30 -cffi==1.14.6 -charset-normalizer==2.0.4 -click==8.0.1 -configparser==5.0.2 -datasets==1.11.0 -debugpy==1.4.3 -decorator==5.0.9 -defusedxml==0.7.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.3 -filelock==3.0.12 -fsspec==2021.8.1 -gitdb==4.0.7 -GitPython==3.1.24 -google-auth==1.35.0 -google-auth-oauthlib==0.4.6 -grpcio==1.41.0 -huggingface-hub==0.0.16 -idna==3.2 -importlib-metadata==4.8.1 -ipykernel==6.4.1 -ipython==7.27.0 -ipython-genutils==0.2.0 -ipywidgets==7.6.4 -jedi==0.18.0 -Jinja2==3.0.1 -joblib==1.0.1 -json5==0.9.6 -jsonschema==3.2.0 -jupyter==1.0.0 -jupyter-client==7.0.2 -jupyter-console==6.4.0 -jupyter-core==4.7.1 -jupyter-server==1.11.0 -jupyterlab==3.1.11 -jupyterlab-pygments==0.1.2 -jupyterlab-server==2.8.1 -jupyterlab-widgets==1.0.1 -lxml==4.6.3 -Markdown==3.3.4 -MarkupSafe==2.0.1 -matplotlib-inline==0.1.3 -mistune==0.8.4 -multiprocess==0.70.12.2 -nbclassic==0.3.1 -nbclient==0.5.4 -nbconvert==6.1.0 -nbformat==5.1.3 -nest-asyncio==1.5.1 -notebook==6.4.3 -numpy==1.21.2 -oauthlib==3.1.1 -packaging==21.0 -pandas==1.3.2 -pandocfilters==1.4.3 -parso==0.8.2 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -Pillow==8.3.2 -prometheus-client==0.11.0 -promise==2.3 -prompt-toolkit==3.0.20 -protobuf==3.18.0 -psutil==5.8.0 -ptyprocess==0.7.0 -pyarrow==5.0.0 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pycparser==2.20 -Pygments==2.10.0 -pyparsing==2.4.7 -pyrsistent==0.18.0 -python-dateutil==2.8.2 -python-dotenv==0.19.0 -pytz==2021.1 -PyYAML==5.4.1 -pyzmq==22.2.1 -qtconsole==5.1.1 -QtPy==1.11.0 -regex==2021.8.28 -requests==2.26.0 -requests-oauthlib==1.3.0 -requests-unixsocket==0.2.0 -rsa==4.7.2 -sacremoses==0.0.45 -scikit-learn==0.24.2 -scipy==1.7.1 -Send2Trash==1.8.0 -sentry-sdk==1.4.2 -shortuuid==1.0.1 -six==1.16.0 -sklearn==0.0 -smmap==4.0.0 -sniffio==1.2.0 -subprocess32==3.5.4 -tensorboard==2.6.0 
-tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.0 -termcolor==1.1.0 -terminado==0.12.1 -testpath==0.5.0 -threadpoolctl==2.2.0 -tokenizers==0.10.3 -torch==1.9.0+cu111 -torchaudio==0.9.0 -torchvision==0.10.0+cu111 -tornado==6.1 -tqdm==4.62.2 -traitlets==5.1.0 -transformers @ git+https://github.com/huggingface/transformers@010965dcde8ce9526f6a7e6e2c3f36276c153708 -typing-extensions==3.10.0.2 -urllib3==1.26.6 -wandb==0.12.2 -wcwidth==0.2.5 -webencodings==0.5.1 -websocket-client==1.2.1 -Werkzeug==2.0.1 -widgetsnbextension==3.5.1 -xxhash==2.0.2 -yaspin==2.1.0 -zipp==3.5.0 From f751bfbba502d05b9b82da1466f39d59e354de91 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 27 Oct 2021 15:19:10 -0400 Subject: [PATCH 041/142] korean exp-001 --- scripts/exp-001/download_oscar_fr.py | 9 -- scripts/exp-001/download_oscar_ko.py | 13 ++ ...nload_oscar_fr.sh => download_oscar_ko.sh} | 8 +- scripts/exp-001/{run_clm.sh => run_clm_ko.sh} | 25 ++-- scripts/exp-001/run_clm_no_tok.sh | 49 ------- scripts/exp-001/train_tokenizer_gpt2.py | 4 +- scripts/exp-002/eval_flue_cls.py | 111 --------------- scripts/exp-002/eval_flue_cls.sh | 47 ------- scripts/exp-002/eval_flue_cls_books.py | 125 ----------------- scripts/exp-002/eval_flue_cls_books.sh | 48 ------- scripts/exp-002/eval_flue_cls_dvd.py | 125 ----------------- scripts/exp-002/eval_flue_cls_dvd.sh | 48 ------- scripts/exp-002/eval_flue_cls_music.py | 127 ----------------- scripts/exp-002/eval_flue_cls_music.sh | 48 ------- scripts/exp-002/eval_flue_paws.py | 103 -------------- scripts/exp-002/eval_flue_paws.sh | 46 ------ scripts/exp-002/eval_flue_xnli.py | 132 ------------------ scripts/exp-002/eval_flue_xnli.sh | 47 ------- scripts/exp-002/gpt2_eval_flue_cls.sh | 44 ------ scripts/exp-002/gpt2_eval_flue_cls_books.sh | 46 ------ scripts/exp-002/gpt2_eval_flue_cls_dvd.sh | 47 ------- scripts/exp-002/gpt2_eval_flue_cls_music.sh | 46 ------ scripts/exp-002/gpt2_eval_flue_paws.sh | 44 ------ scripts/exp-002/gpt2_eval_flue_xnli.sh | 47 ------- scripts/exp-004/download_pawsx.py | 9 -- scripts/exp-004/download_pawsx.sh | 30 ---- scripts/exp-004/eval_paws_en.py | 106 -------------- scripts/exp-004/eval_paws_en.sh | 63 --------- scripts/exp-004/eval_paws_fr.py | 108 -------------- scripts/exp-004/eval_paws_fr_ft.sh | 46 ------ scripts/exp-004/eval_paws_fr_no_ft.sh | 46 ------ .../exp-004/eval_paws_fr_swapped_embedding.py | 117 ---------------- .../eval_paws_fr_swapped_embedding_ft.sh | 48 ------- .../eval_paws_fr_swapped_embedding_no_ft.sh | 48 ------- 34 files changed, 32 insertions(+), 1978 deletions(-) delete mode 100644 scripts/exp-001/download_oscar_fr.py create mode 100644 scripts/exp-001/download_oscar_ko.py rename scripts/exp-001/{download_oscar_fr.sh => download_oscar_ko.sh} (84%) rename scripts/exp-001/{run_clm.sh => run_clm_ko.sh} (71%) delete mode 100644 scripts/exp-001/run_clm_no_tok.sh delete mode 100644 scripts/exp-002/eval_flue_cls.py delete mode 100644 scripts/exp-002/eval_flue_cls.sh delete mode 100644 scripts/exp-002/eval_flue_cls_books.py delete mode 100644 scripts/exp-002/eval_flue_cls_books.sh delete mode 100644 scripts/exp-002/eval_flue_cls_dvd.py delete mode 100644 scripts/exp-002/eval_flue_cls_dvd.sh delete mode 100644 scripts/exp-002/eval_flue_cls_music.py delete mode 100644 scripts/exp-002/eval_flue_cls_music.sh delete mode 100644 scripts/exp-002/eval_flue_paws.py delete mode 100644 scripts/exp-002/eval_flue_paws.sh delete mode 100644 scripts/exp-002/eval_flue_xnli.py delete mode 100644 scripts/exp-002/eval_flue_xnli.sh delete mode 
100644 scripts/exp-002/gpt2_eval_flue_cls.sh delete mode 100644 scripts/exp-002/gpt2_eval_flue_cls_books.sh delete mode 100644 scripts/exp-002/gpt2_eval_flue_cls_dvd.sh delete mode 100644 scripts/exp-002/gpt2_eval_flue_cls_music.sh delete mode 100644 scripts/exp-002/gpt2_eval_flue_paws.sh delete mode 100644 scripts/exp-002/gpt2_eval_flue_xnli.sh delete mode 100644 scripts/exp-004/download_pawsx.py delete mode 100644 scripts/exp-004/download_pawsx.sh delete mode 100644 scripts/exp-004/eval_paws_en.py delete mode 100644 scripts/exp-004/eval_paws_en.sh delete mode 100644 scripts/exp-004/eval_paws_fr.py delete mode 100644 scripts/exp-004/eval_paws_fr_ft.sh delete mode 100644 scripts/exp-004/eval_paws_fr_no_ft.sh delete mode 100644 scripts/exp-004/eval_paws_fr_swapped_embedding.py delete mode 100644 scripts/exp-004/eval_paws_fr_swapped_embedding_ft.sh delete mode 100644 scripts/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh diff --git a/scripts/exp-001/download_oscar_fr.py b/scripts/exp-001/download_oscar_fr.py deleted file mode 100644 index fd04779..0000000 --- a/scripts/exp-001/download_oscar_fr.py +++ /dev/null @@ -1,9 +0,0 @@ -from datasets import load_dataset -from dotenv import load_dotenv -import os -from pathlib import Path - -load_dotenv(str(Path.home() / ".env")) - -dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_fr") -print("Done") \ No newline at end of file diff --git a/scripts/exp-001/download_oscar_ko.py b/scripts/exp-001/download_oscar_ko.py new file mode 100644 index 0000000..6a4354f --- /dev/null +++ b/scripts/exp-001/download_oscar_ko.py @@ -0,0 +1,13 @@ +from datasets import load_dataset +from dotenv import load_dotenv +import os +from pathlib import Path + +load_dotenv(str(Path.home() / ".env")) + +dataset = load_dataset("oscar", "unshuffled_deduplicated_ko", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_ko") + +from datasets.filesystems import S3FileSystem +s3 = S3FileSystem(key="AKIAWN4PGMXV32L5MW5B", secret="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") +dataset.save_to_disk('s3://bigscience-add-lang/oscar_ko', fs=s3) +print("Done") \ No newline at end of file diff --git a/scripts/exp-001/download_oscar_fr.sh b/scripts/exp-001/download_oscar_ko.sh similarity index 84% rename from scripts/exp-001/download_oscar_fr.sh rename to scripts/exp-001/download_oscar_ko.sh index a558c9e..f6bff6d 100644 --- a/scripts/exp-001/download_oscar_fr.sh +++ b/scripts/exp-001/download_oscar_ko.sh @@ -12,11 +12,11 @@ #SBATCH --mem=50g # Specify a job name: -#SBATCH -J exp-031-download_oscar_fr +#SBATCH -J exp-031-download_oscar_ko # Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_fr.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_fr.err +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_ko.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_ko.err # Set up the environment by loading modules # Set up the environment by loading modules @@ -27,4 +27,4 @@ set +a module load python/3.7.4 source $FP_BIGS/env_lang_mod/bin/activate -python3 $FP_BIGS/scripts/exp-001/download_oscar_fr.py \ No newline at end of file +python3 $FP_BIGS/scripts/exp-001/download_oscar_ko.py \ No newline at end of file diff --git a/scripts/exp-001/run_clm.sh b/scripts/exp-001/run_clm_ko.sh similarity index 71% rename from scripts/exp-001/run_clm.sh rename to scripts/exp-001/run_clm_ko.sh index a88ff21..60fb27e 100644 --- 
a/scripts/exp-001/run_clm.sh +++ b/scripts/exp-001/run_clm_ko.sh @@ -1,23 +1,23 @@ #!/bin/bash # Request half an hour of runtime: -#SBATCH --time=5-23:59:00 +#SBATCH --time=6-23:59:00 # Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:8 +#SBATCH --partition=gpu-he --gres=gpu:4 # Default resources are 1 core with 2.8GB of memory. #SBATCH --ntasks=16 # Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g +#SBATCH --mem=50g # Specify a job name: -#SBATCH -J exp-001-run_clm +#SBATCH -J exp-001-run_clm_ko # Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm.err +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_ko.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_ko.err # Set up the environment by loading modules set -a # automatically export all variables @@ -25,20 +25,21 @@ source ~/.env set +a module load python/3.7.4 +module load gitlfs/2.7.1 source $FP_BIGS/env_lang_mod/bin/activate -tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer" -cache_dir="${FP_BIGS}/data/external/oscar_fr" -output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-2" -logging_dir="${FP_BIGS}/reports/exp-001/ft-gpt2-2" -ckpt_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-2/checkpoint-25000" +tokenizer_dir="yongzx/gpt2-finetuned-oscar-ko" +cache_dir="${FP_BIGS}/data/external/oscar_ko" +output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-ko" +logging_dir="${FP_BIGS}/reports/exp-001/ft-gpt2-ko" +ckpt_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-ko/checkpoint-195500" python $FP_BIGS/scripts/exp-001/run_clm.py \ --model_name_or_path gpt2 \ --tokenizer_name $tokenizer_dir \ --dataset_name oscar \ --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_fr \ + --dataset_config_name unshuffled_deduplicated_ko \ --logging_dir $logging_dir \ --report_to "tensorboard" \ --learning_rate 0.001 \ diff --git a/scripts/exp-001/run_clm_no_tok.sh b/scripts/exp-001/run_clm_no_tok.sh deleted file mode 100644 index af2be7f..0000000 --- a/scripts/exp-001/run_clm_no_tok.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=5-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:8 - -# Default resources are 1 core with 2.8GB of memory. 
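One functional change in run_clm_ko.sh above: tokenizer_dir now names a Hub repo (yongzx/gpt2-finetuned-oscar-ko) rather than a local tokenizer directory, while --model_name_or_path still starts from the English gpt2 checkpoint. A rough sketch of what run_clm.py ends up loading, assuming the Hub repo resolves like any other pretrained tokenizer:

from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer comes from the Hub repo named in run_clm_ko.sh; the model starts from English gpt2.
tokenizer = AutoTokenizer.from_pretrained("yongzx/gpt2-finetuned-oscar-ko")
model = AutoModelForCausalLM.from_pretrained("gpt2")
# run_clm.py then fine-tunes this pair on OSCAR unshuffled_deduplicated_ko.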
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-001-run_clm_no_tok - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_no_tok.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_no_tok.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -tokenizer_dir="${FP_BIGS}/data/processed/exp-001/oscar-fr-tokenizer" -cache_dir="${FP_BIGS}/data/external/oscar_fr" -output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-no-tok" - -python $FP_BIGS/scripts/exp-001/run_clm.py \ - --model_name_or_path gpt2 \ - --dataset_name oscar \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_fr \ - --do_train \ - --do_eval \ - --output_dir $output_dir \ - --preprocessing_num_workers 8 \ - --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ - --eval_steps 500 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ No newline at end of file diff --git a/scripts/exp-001/train_tokenizer_gpt2.py b/scripts/exp-001/train_tokenizer_gpt2.py index d8fe237..20e6555 100644 --- a/scripts/exp-001/train_tokenizer_gpt2.py +++ b/scripts/exp-001/train_tokenizer_gpt2.py @@ -5,7 +5,7 @@ load_dotenv(str(Path.home() / ".env")) -dataset = load_dataset("oscar", "unshuffled_deduplicated_fr", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_fr") +dataset = load_dataset("oscar", "unshuffled_deduplicated_ko", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_ko") def batch_iterator(): batch_size = 1000 @@ -17,4 +17,4 @@ def batch_iterator(): tokenizer = AutoTokenizer.from_pretrained("gpt2") assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50_257) -new_tokenizer.save_pretrained(f"{os.getenv('FP_BIGS')}/data/processed/exp-001/oscar-fr-tokenizer") \ No newline at end of file +new_tokenizer.save_pretrained(f"{os.getenv('FP_BIGS')}/data/processed/exp-001/oscar-ko-tokenizer") \ No newline at end of file diff --git a/scripts/exp-002/eval_flue_cls.py b/scripts/exp-002/eval_flue_cls.py deleted file mode 100644 index a0fe4c0..0000000 --- a/scripts/exp-002/eval_flue_cls.py +++ /dev/null @@ -1,111 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -cls_train_datasetdict = load_dataset("flue", "CLS", - split=f"train", - 
cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue").train_test_split(train_size=0.8, shuffle=True, seed=42) -cls_train_dataset = cls_train_datasetdict['train'] -cls_val_dataset = cls_train_datasetdict['test'] -cls_test_dataset = load_dataset("flue", "CLS", - split="test", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") # "PAWS-X", "XNLI", "CLS", "WSD-V" - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) -full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) -full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) - - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/scripts/exp-002/eval_flue_cls.sh b/scripts/exp-002/eval_flue_cls.sh deleted file mode 100644 index 00adecd..0000000 --- a/scripts/exp-002/eval_flue_cls.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_cls - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-base/$lr" - # MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - MODEL_NAME="gpt-2" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/scripts/exp-002/eval_flue_cls_books.py b/scripts/exp-002/eval_flue_cls_books.py deleted file mode 100644 index 141f579..0000000 --- a/scripts/exp-002/eval_flue_cls_books.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split=f"train", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) -cls_train_dataset = cls_train_datasetdict['train'] -cls_val_dataset = cls_train_datasetdict['test'] -cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split="test", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" - -print("Before splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -# split: books -cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="books") -cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="books") -cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="books") - -print("After splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, 
GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) -full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) -full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) - - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-002/eval_flue_cls_books.sh b/scripts/exp-002/eval_flue_cls_books.sh deleted file mode 100644 index 15a064a..0000000 --- a/scripts/exp-002/eval_flue_cls_books.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_cls_books - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_books.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_books.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change books - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-books-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - # change books - python $FP_BIGS/scripts/exp-002/eval_flue_cls_books.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/scripts/exp-002/eval_flue_cls_dvd.py b/scripts/exp-002/eval_flue_cls_dvd.py deleted file mode 100644 index 0b8f675..0000000 --- a/scripts/exp-002/eval_flue_cls_dvd.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split=f"train", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) -cls_train_dataset = cls_train_datasetdict['train'] -cls_val_dataset = cls_train_datasetdict['test'] -cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split="test", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" - -print("Before splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -# split: dvd -cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="dvd") -cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="dvd") -cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="dvd") - -print("After splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers 
import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) -full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) -full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) - - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-002/eval_flue_cls_dvd.sh b/scripts/exp-002/eval_flue_cls_dvd.sh deleted file mode 100644 index c6c3f39..0000000 --- a/scripts/exp-002/eval_flue_cls_dvd.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_cls_dvd - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_dvd.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_dvd.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change dvd - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-dvd-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - # change dvd - python $FP_BIGS/scripts/exp-002/eval_flue_cls_dvd.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/scripts/exp-002/eval_flue_cls_music.py b/scripts/exp-002/eval_flue_cls_music.py deleted file mode 100644 index 721843c..0000000 --- a/scripts/exp-002/eval_flue_cls_music.py +++ /dev/null @@ -1,127 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -cls_train_datasetdict = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split=f"train", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls").train_test_split(train_size=0.8, shuffle=True, seed=42) -cls_train_dataset = cls_train_datasetdict['train'] -cls_val_dataset = cls_train_datasetdict['test'] -cls_test_dataset = load_dataset("/users/zyong2/data/zyong2/bigscience/notebooks/nb-002/flue", - "CLS", - split="test", - cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue-cls") # "PAWS-X", "XNLI", "CLS", "WSD-V" - -print("Before splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -# split: music -cls_train_dataset = cls_train_dataset.filter(lambda x:x['category']=="music") -cls_val_dataset = cls_val_dataset.filter(lambda x:x['category']=="music") -cls_test_dataset = cls_test_dataset.filter(lambda x:x['category']=="music") - -print("After splitting:") -print(cls_train_dataset) -print(cls_val_dataset) -print(cls_test_dataset) - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers 
import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(examples["text"], padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = cls_train_dataset.map(tokenize_function, batched=True) -full_val_dataset = cls_val_dataset.map(tokenize_function, batched=True) -full_test_dataset = cls_test_dataset.map(tokenize_function, batched=True) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(10)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(10)) - - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/scripts/exp-002/eval_flue_cls_music.sh b/scripts/exp-002/eval_flue_cls_music.sh deleted file mode 100644 index fd29ce9..0000000 --- a/scripts/exp-002/eval_flue_cls_music.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
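The books, dvd, and music variants above are copies of eval_flue_cls.py that differ only in the hard-coded category filter. A hypothetical consolidated version (the --category flag is an assumption, not part of the original scripts; cache_dir and local dataset paths omitted):

import argparse
from datasets import load_dataset

parser = argparse.ArgumentParser()
parser.add_argument("--category", choices=["books", "dvd", "music"], required=True)
args = parser.parse_args()

# Same 80/20 train/validation split as the deleted scripts, then one shared filter.
cls = load_dataset("flue", "CLS", split="train").train_test_split(train_size=0.8, shuffle=True, seed=42)
cls_train = cls["train"].filter(lambda x: x["category"] == args.category)
cls_val = cls["test"].filter(lambda x: x["category"] == args.category)
cls_test = load_dataset("flue", "CLS", split="test").filter(lambda x: x["category"] == args.category)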
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_cls_music - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_music.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_cls_music.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change music - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-music-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - # change music - python $FP_BIGS/scripts/exp-002/eval_flue_cls_music.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/scripts/exp-002/eval_flue_paws.py b/scripts/exp-002/eval_flue_paws.py deleted file mode 100644 index c33649a..0000000 --- a/scripts/exp-002/eval_flue_paws.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -paws_dataset = load_dataset("flue", "PAWS-X", cache_dir="/users/zyong2/data/zyong2/bigscience/data/external/flue") -paws_train_dataset = paws_dataset['train'] -paws_val_dataset = paws_dataset['validation'] -paws_test_dataset = paws_dataset['test'] - -import torch -import numpy as np -from transformers import Trainer, TrainingArguments -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = 
full_test_dataset.shuffle(seed=42).select(range(100)) - -from datasets import load_metric - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) - -if args.do_train: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/scripts/exp-002/eval_flue_paws.sh b/scripts/exp-002/eval_flue_paws.sh deleted file mode 100644 index 8644967..0000000 --- a/scripts/exp-002/eval_flue_paws.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_paws - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_paws.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_paws.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 5e-6 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/scripts/exp-002/eval_flue_xnli.py b/scripts/exp-002/eval_flue_xnli.py deleted file mode 100644 index b2da78b..0000000 --- a/scripts/exp-002/eval_flue_xnli.py +++ /dev/null @@ -1,132 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric - - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) 
-logging.getLogger().addHandler(logging.StreamHandler()) - - -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -# load dataset -xnli_dataset = load_dataset("flue", "XNLI", cache_dir=args.cache_dir) -xnli_train_dataset = xnli_dataset['train'] -xnli_val_dataset = xnli_dataset['validation'] -xnli_test_dataset = xnli_dataset['test'] - - -# load tokenizer -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypo"]}', padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = xnli_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = xnli_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = xnli_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -metric = load_metric("accuracy") -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -if args.do_train: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=0) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - - model = GPT2ForSequenceClassification.from_pretrained(evaluation_dirs[-1], - num_labels=3, - pad_token_id=0) - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_model = evaluation_dirs[-1] - 
logging.info(f"Loading trained model from {evaluation_dirs[-1]}") - - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=0) - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/scripts/exp-002/eval_flue_xnli.sh b/scripts/exp-002/eval_flue_xnli.sh deleted file mode 100644 index 619b856..0000000 --- a/scripts/exp-002/eval_flue_xnli.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=6-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_xnli - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-xnli-gpt2-tok/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-110500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_xnli.py $OUTPUT_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/scripts/exp-002/gpt2_eval_flue_cls.sh b/scripts/exp-002/gpt2_eval_flue_cls.sh deleted file mode 100644 index 6cf0746..0000000 --- a/scripts/exp-002/gpt2_eval_flue_cls.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_cls - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_cls.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $MODEL_NAME -done diff --git a/scripts/exp-002/gpt2_eval_flue_cls_books.sh b/scripts/exp-002/gpt2_eval_flue_cls_books.sh deleted file mode 100644 index 135e5fb..0000000 --- a/scripts/exp-002/gpt2_eval_flue_cls_books.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_cls_books - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_books.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_books.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change books - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-books-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - # change books - python $FP_BIGS/scripts/exp-002/eval_flue_cls_books.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer "gpt2" -done diff --git a/scripts/exp-002/gpt2_eval_flue_cls_dvd.sh b/scripts/exp-002/gpt2_eval_flue_cls_dvd.sh deleted file mode 100644 index 2507391..0000000 --- a/scripts/exp-002/gpt2_eval_flue_cls_dvd.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_cls_dvd - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_dvd.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_dvd.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change dvd - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-dvd-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - # change dvd - python $FP_BIGS/scripts/exp-002/eval_flue_cls_dvd.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer "gpt2" - -done diff --git a/scripts/exp-002/gpt2_eval_flue_cls_music.sh b/scripts/exp-002/gpt2_eval_flue_cls_music.sh deleted file mode 100644 index 8b4fc4b..0000000 --- a/scripts/exp-002/gpt2_eval_flue_cls_music.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_cls_music - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_music.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_cls_music.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # change music - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-cls-music-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - # change music - python $FP_BIGS/scripts/exp-002/eval_flue_cls_music.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer "gpt2" -done diff --git a/scripts/exp-002/gpt2_eval_flue_paws.sh b/scripts/exp-002/gpt2_eval_flue_paws.sh deleted file mode 100644 index 7ebb253..0000000 --- a/scripts/exp-002/gpt2_eval_flue_paws.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-002-gpt2_eval_flue_paws - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_paws.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/gpt2_eval_flue_paws.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -# learning_rates=( 5e-6 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-paws-gpt2-base/$lr" - MODEL_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_paws.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer "gpt2" -done diff --git a/scripts/exp-002/gpt2_eval_flue_xnli.sh b/scripts/exp-002/gpt2_eval_flue_xnli.sh deleted file mode 100644 index 928a3fb..0000000 --- a/scripts/exp-002/gpt2_eval_flue_xnli.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=6-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-002-eval_flue_xnli - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/002/eval_flue_xnli.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-002/flue-xnli-gpt2-tok/$lr" - MODEL_NAME="gpt2" - TOKENIZER_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-002/eval_flue_xnli.py $OUTPUT_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train -done diff --git a/scripts/exp-004/download_pawsx.py b/scripts/exp-004/download_pawsx.py deleted file mode 100644 index 2d9e976..0000000 --- a/scripts/exp-004/download_pawsx.py +++ /dev/null @@ -1,9 +0,0 @@ -from datasets import load_dataset -from dotenv import load_dotenv -import os -from pathlib import Path - -load_dotenv(str(Path.home() / ".env")) - -dataset = load_dataset("paws-x", 'fr', cache_dir=f"{os.getenv('FP_BIGS')}/data/external/paws-x") -print("Done") \ No newline at end of file diff --git a/scripts/exp-004/download_pawsx.sh b/scripts/exp-004/download_pawsx.sh deleted file mode 100644 index aa9e806..0000000 --- a/scripts/exp-004/download_pawsx.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=3-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-004-download_pawsx - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/download_pawsx.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/download_pawsx.err - -# Set up the environment by loading modules -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -python3 $FP_BIGS/scripts/exp-004/download_pawsx.py \ No newline at end of file diff --git a/scripts/exp-004/eval_paws_en.py b/scripts/exp-004/eval_paws_en.py deleted file mode 100644 index 7522436..0000000 --- a/scripts/exp-004/eval_paws_en.py +++ /dev/null @@ -1,106 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - -from datasets import load_dataset -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset - -paws_dataset = load_dataset("paws-x", 'en', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") -paws_train_dataset = paws_dataset['train'] -paws_val_dataset = paws_dataset['validation'] -paws_test_dataset = paws_dataset['test'] - -import torch -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -def tokenize_function(examples): - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) - -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -from transformers import TrainingArguments - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -from transformers 
import Trainer -from datasets import load_metric -import numpy as np - -metric = load_metric("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) - -if args.do_train: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/scripts/exp-004/eval_paws_en.sh b/scripts/exp-004/eval_paws_en.sh deleted file mode 100644 index 9496863..0000000 --- a/scripts/exp-004/eval_paws_en.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_en - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_en.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_en.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -# for lr in ${learning_rates[@]} ; do -# echo "LR ===== $lr" -# OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" -# MODEL_NAME="gpt2" -# TOKENIZER_NAME="gpt2" -# mkdir -p $OUTPUT_DIR - -# python $FP_BIGS/scripts/exp-004/eval_paws_en.py $OUTPUT_DIR \ -# --num_train_epochs 30 \ -# --learning_rate $lr \ -# --per_device_train_batch_size 4 \ -# --gradient_accumulation_steps 4 \ -# --pretrained_model $MODEL_NAME \ -# --tokenizer $TOKENIZER_NAME \ -# --do_train -# done - -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" - TOKENIZER_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_en.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_predict -done \ No newline at end of file diff --git a/scripts/exp-004/eval_paws_fr.py b/scripts/exp-004/eval_paws_fr.py deleted file mode 100644 index 180c03e..0000000 --- a/scripts/exp-004/eval_paws_fr.py +++ /dev/null @@ -1,108 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) 
-parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -logging.info("Load Tokenizer") -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -logging.info("Load Raw Dataset") -paws_dataset = load_dataset("paws-x", 'fr', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") -paws_train_dataset = paws_dataset['train'] -paws_val_dataset = paws_dataset['validation'] -paws_test_dataset = paws_dataset['test'] - -def tokenize_function(examples): - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) - -logging.info("Load Dataset Ready for Training") -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logging.info("Load Metric") -from datasets import load_metric -metric = load_metric("accuracy") -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) - -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/scripts/exp-004/eval_paws_fr_ft.sh b/scripts/exp-004/eval_paws_fr_ft.sh deleted file mode 100644 index 8f14a14..0000000 --- a/scripts/exp-004/eval_paws_fr_ft.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition 
and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_fr_ft - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_ft.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_ft.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-base/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr/checkpoint-92610" - TOKENIZER_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_fr.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/scripts/exp-004/eval_paws_fr_no_ft.sh b/scripts/exp-004/eval_paws_fr_no_ft.sh deleted file mode 100644 index 5a57c6b..0000000 --- a/scripts/exp-004/eval_paws_fr_no_ft.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_fr_no_ft - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_no_ft.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_no_ft.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-base/$lr" - MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr/checkpoint-92610" - TOKENIZER_NAME="gpt2" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_fr.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_predict -done diff --git a/scripts/exp-004/eval_paws_fr_swapped_embedding.py b/scripts/exp-004/eval_paws_fr_swapped_embedding.py deleted file mode 100644 index 3199a41..0000000 --- a/scripts/exp-004/eval_paws_fr_swapped_embedding.py +++ /dev/null @@ -1,117 +0,0 @@ -import logging -# setup logging -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s ====== %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.INFO, -) -logging.getLogger().addHandler(logging.StreamHandler()) - - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) 
-parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--fr_gpt2_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -args = parser.parse_args() -assert args.do_train ^ args.do_predict # current code doesnt allow do_train followed by do_predict - - -from datasets import load_dataset -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -logging.info("Load Tokenizer") -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer) - -logging.info("Load Raw Dataset") -paws_dataset = load_dataset("paws-x", 'fr', cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/paws-x") -paws_train_dataset = paws_dataset['train'] -paws_val_dataset = paws_dataset['validation'] -paws_test_dataset = paws_dataset['test'] - -def tokenize_function(examples): - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', padding="max_length", truncation=True) - -logging.info("Load Dataset Ready for Training") -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = paws_train_dataset.map(tokenize_function, batched=False) -full_val_dataset = paws_val_dataset.map(tokenize_function, batched=False) -full_test_dataset = paws_test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logging.info("Load Metric") -from datasets import load_metric -metric = load_metric("accuracy") -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=2, - pad_token_id=0) - -fr_model = GPT2ForSequenceClassification.from_pretrained(args.fr_gpt2_model, - num_labels=2, - pad_token_id=0) - -# swapped the embedding layers -model.transformer.wte.weight = fr_model.transformer.wte.weight -model.transformer.wpe.weight = fr_model.transformer.wpe.weight - -if args.do_train: - logging.info("Start Training") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=full_train_dataset, - eval_dataset=full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - logging.info("Start Evaluation") - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate:", trainer.evaluate()) - - diff --git a/scripts/exp-004/eval_paws_fr_swapped_embedding_ft.sh 
b/scripts/exp-004/eval_paws_fr_swapped_embedding_ft.sh deleted file mode 100644 index b177cc9..0000000 --- a/scripts/exp-004/eval_paws_fr_swapped_embedding_ft.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_fr_swapped_embedding_ft - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_ft.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_ft.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-fr-gpt2-rp-embedding/$lr" - EN_MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" - FR_MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-111500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_fr_swapped_embedding.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $EN_MODEL_NAME \ - --fr_gpt2_model $FR_MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train -done diff --git a/scripts/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh b/scripts/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh deleted file mode 100644 index 6af8422..0000000 --- a/scripts/exp-004/eval_paws_fr_swapped_embedding_no_ft.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-004-eval_paws_fr_swapped_embedding_no_ft - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_no_ft.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/004/eval_paws_fr_swapped_embedding_no_ft.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/$lr" - EN_MODEL_NAME="$FP_BIGS/data/processed/exp-004/paws-en-gpt2-base/1e-5/checkpoint-92610" - FR_MODEL_NAME="$FP_BIGS/data/processed/exp-001/ft-gpt2-2/checkpoint-111500" - TOKENIZER_NAME="$FP_BIGS/data/processed/exp-001/oscar-fr-tokenizer" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-004/eval_paws_fr_swapped_embedding.py $OUTPUT_DIR \ - --num_train_epochs 30 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $EN_MODEL_NAME \ - --fr_gpt2_model $FR_MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_predict -done From ef390fc52d4115f561175b6977e985e5b3a1a7e3 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 27 Oct 2021 15:23:43 -0400 Subject: [PATCH 042/142] update --- scripts/README.md | 5 ++--- scripts/exp-001/download_oscar_ko.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 66c5a09..c1da51c 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,7 +1,6 @@ # Current Experiments -- `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_fr`. -- `exp-002`: evaluate gpt-2-{finetuned on OSCAR-FR, base} on FLUE's tasks (CLS, XNLI, PAWS) -- `exp-004`: evaluate gpt-2 base and swapped-embedding-layers for PAWS-X +- `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_ko`. 
+- `exp-002`: evaluate gpt-2-{finetuned on OSCAR-KO, base} on KLUE's tasks (CLS, XNLI, PAWS) # Carbon Tracking Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) diff --git a/scripts/exp-001/download_oscar_ko.py b/scripts/exp-001/download_oscar_ko.py index 6a4354f..534073d 100644 --- a/scripts/exp-001/download_oscar_ko.py +++ b/scripts/exp-001/download_oscar_ko.py @@ -8,6 +8,6 @@ dataset = load_dataset("oscar", "unshuffled_deduplicated_ko", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_ko") from datasets.filesystems import S3FileSystem -s3 = S3FileSystem(key="AKIAWN4PGMXV32L5MW5B", secret="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") +s3 = S3FileSystem(key="KEY", secret="VAL") dataset.save_to_disk('s3://bigscience-add-lang/oscar_ko', fs=s3) print("Done") \ No newline at end of file From 2809dd6420b886ca3bc283ca737119ebc03c4d73 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 21 Dec 2021 15:58:56 -0500 Subject: [PATCH 043/142] upload scripts --- README.md | 17 +- scripts/exp-005/download_oscar.py | 11 + scripts/exp-005/download_oscar_de.sh | 29 + scripts/exp-005/run_clm.py | 551 +++++++++++++++ scripts/exp-005/run_clm_de.sh | 56 ++ scripts/exp-005/train_tokenizer_gpt2.py | 20 + scripts/exp-005/train_tokenizer_gpt2.sh | 29 + .../GPT2ForQuestionAnswering.cpython-37.pyc | Bin 0 -> 5168 bytes .../__pycache__/trainer_qa.cpython-37.pyc | Bin 0 -> 2574 bytes scripts/exp-006/xnli/adapters_xnli_de.py | 191 +++++ scripts/exp-006/xnli/adapters_xnli_de_gpt2.sh | 58 ++ scripts/exp-006/xnli/adapters_xnli_ko.py | 234 +++++++ scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh | 57 ++ scripts/exp-006/xnli/xnli_de.py | 162 +++++ scripts/exp-006/xnli/xnli_de_ft_gpt2.sh | 53 ++ scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh | 54 ++ scripts/exp-006/xnli/xnli_de_gpt2.sh | 53 ++ scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh | 54 ++ scripts/exp-006/xnli/xnli_ko.py | 213 ++++++ scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh | 52 ++ scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh | 53 ++ ...423e05ef7d46c9ba83614b4c527017571b9d2898b8 | 124 ++++ ...5ef7d46c9ba83614b4c527017571b9d2898b8.json | 1 + ...5ef7d46c9ba83614b4c527017571b9d2898b8.lock | 0 .../exp-006/xquad/GPT2ForQuestionAnswering.py | 129 ++++ .../GPT2ForQuestionAnswering.cpython-37.pyc | Bin 0 -> 5202 bytes .../__pycache__/trainer_qa.cpython-37.pyc | Bin 0 -> 2580 bytes .../xquad/__pycache__/utils_qa.cpython-37.pyc | Bin 0 -> 13232 bytes scripts/exp-006/xquad/eval_germanquad.sh | 52 ++ scripts/exp-006/xquad/eval_qa.py | 655 ++++++++++++++++++ scripts/exp-006/xquad/trainer_qa.py | 105 +++ scripts/exp-006/xquad/utils_qa.py | 431 ++++++++++++ scripts/exp-007/madx_run_clm.py | 593 ++++++++++++++++ scripts/exp-007/run_clm_de.sh | 60 ++ scripts/exp-007/run_clm_en.sh | 61 ++ scripts/exp-007/run_clm_ko.sh | 61 ++ scripts/exp-008/xnli/xnli_de.py | 151 ++++ scripts/exp-008/xnli/xnli_de_mbert.sh | 53 ++ scripts/exp-008/xnli/xnli_de_mbert_0shot.sh | 54 ++ scripts/exp-008/xnli/xnli_ko.py | 197 ++++++ scripts/exp-008/xnli/xnli_ko_mbert.sh | 53 ++ scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh | 54 ++ 42 files changed, 4780 insertions(+), 1 deletion(-) create mode 100644 scripts/exp-005/download_oscar.py create mode 100644 scripts/exp-005/download_oscar_de.sh create mode 100644 scripts/exp-005/run_clm.py create mode 100644 scripts/exp-005/run_clm_de.sh create mode 100644 scripts/exp-005/train_tokenizer_gpt2.py create mode 100644 scripts/exp-005/train_tokenizer_gpt2.sh create mode 
100644 scripts/exp-006/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc
 create mode 100644 scripts/exp-006/__pycache__/trainer_qa.cpython-37.pyc
 create mode 100644 scripts/exp-006/xnli/adapters_xnli_de.py
 create mode 100644 scripts/exp-006/xnli/adapters_xnli_de_gpt2.sh
 create mode 100644 scripts/exp-006/xnli/adapters_xnli_ko.py
 create mode 100644 scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh
 create mode 100644 scripts/exp-006/xnli/xnli_de.py
 create mode 100644 scripts/exp-006/xnli/xnli_de_ft_gpt2.sh
 create mode 100644 scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh
 create mode 100644 scripts/exp-006/xnli/xnli_de_gpt2.sh
 create mode 100644 scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh
 create mode 100644 scripts/exp-006/xnli/xnli_ko.py
 create mode 100644 scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh
 create mode 100644 scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh
 create mode 100644 scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8
 create mode 100644 scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.json
 create mode 100755 scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.lock
 create mode 100644 scripts/exp-006/xquad/GPT2ForQuestionAnswering.py
 create mode 100644 scripts/exp-006/xquad/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc
 create mode 100644 scripts/exp-006/xquad/__pycache__/trainer_qa.cpython-37.pyc
 create mode 100644 scripts/exp-006/xquad/__pycache__/utils_qa.cpython-37.pyc
 create mode 100644 scripts/exp-006/xquad/eval_germanquad.sh
 create mode 100644 scripts/exp-006/xquad/eval_qa.py
 create mode 100644 scripts/exp-006/xquad/trainer_qa.py
 create mode 100644 scripts/exp-006/xquad/utils_qa.py
 create mode 100644 scripts/exp-007/madx_run_clm.py
 create mode 100644 scripts/exp-007/run_clm_de.sh
 create mode 100644 scripts/exp-007/run_clm_en.sh
 create mode 100644 scripts/exp-007/run_clm_ko.sh
 create mode 100644 scripts/exp-008/xnli/xnli_de.py
 create mode 100644 scripts/exp-008/xnli/xnli_de_mbert.sh
 create mode 100644 scripts/exp-008/xnli/xnli_de_mbert_0shot.sh
 create mode 100644 scripts/exp-008/xnli/xnli_ko.py
 create mode 100644 scripts/exp-008/xnli/xnli_ko_mbert.sh
 create mode 100644 scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh
diff --git a/README.md b/README.md
index 7f146a7..11f32dd 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,16 @@
-# multilingual-modeling
\ No newline at end of file
+### Previous Experiments
+- `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_fr` and `unshuffled_deduplicated_ko`.
+- `exp-002`: evaluate gpt-2 on FLUE's tasks (CLS, XNLI, PAWS)
+- `exp-003`: TODO: evaluate on multiatis
+- `exp-004`: Does the embedding layer learn anything useful? Take the English PAWS-X dataset, finetune GPT-2 on it, and evaluate it on the English test set T_e. Then take the same test set translated into French (T_f), keep the GPT-2 parameters fine-tuned for the task, replace the English embeddings with the French embeddings, and evaluate the resulting model on the French test set.
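A minimal sketch of the exp-004 embedding swap, assuming a GPT-2 checkpoint fine-tuned on English PAWS-X and a GPT-2 checkpoint whose embeddings were fine-tuned on French OSCAR are available locally (the checkpoint paths and variable names below are placeholders, not the repository's actual paths):

```python
from transformers import GPT2ForSequenceClassification

# English task model (fine-tuned on PAWS-X en) and French language-adapted model
# (GPT-2 with embeddings fine-tuned on OSCAR fr); both paths are placeholders.
en_model = GPT2ForSequenceClassification.from_pretrained(
    "path/to/paws-en-gpt2-checkpoint", num_labels=2, pad_token_id=0)
fr_model = GPT2ForSequenceClassification.from_pretrained(
    "path/to/oscar-fr-gpt2-checkpoint", num_labels=2, pad_token_id=0)

# Swap in the French token and position embeddings while keeping the
# task-specific transformer weights learned on the English data.
en_model.transformer.wte.weight = fr_model.transformer.wte.weight
en_model.transformer.wpe.weight = fr_model.transformer.wpe.weight

# en_model is then evaluated on the French PAWS-X test set,
# tokenized with the French (OSCAR) tokenizer.
```

This mirrors the swap done in `scripts/exp-004/eval_paws_fr_swapped_embedding.py`, where only `transformer.wte.weight` and `transformer.wpe.weight` are taken from the French model.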
+ +# Experiment folders below after Conversation with Vassilina, Hady, Iz, and Maruf [Link](https://huggingface.slack.com/archives/C020G6A9KHQ/p1637023149074800) +- `exp-005`: cleaned from `exp-001` for finetuning GPT-2 embedding layers for DE and KO on Oscar. +- `exp-006`: run zero-shot and finetuned evaluation setting for XNLI ✅, PAWS ❌, and XQuAD ❌. (❌ means not done. ✅ means done.) +- `exp-007`: apply MAD-X adapter method. [Paper link](https://arxiv.org/abs/2005.00052) +- `exp-008`: from exp-006, but using mBERT on the zero-shot and finetuning setting. + + +# Carbon Tracking +Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) + diff --git a/scripts/exp-005/download_oscar.py b/scripts/exp-005/download_oscar.py new file mode 100644 index 0000000..4a2f504 --- /dev/null +++ b/scripts/exp-005/download_oscar.py @@ -0,0 +1,11 @@ +from datasets import load_dataset +import os +from pathlib import Path +import argparse + +parser = argparse.ArgumentParser(description='') +parser.add_argument('lang', type=str, help='language subset') +args = parser.parse_args() + +dataset = load_dataset("oscar", f"unshuffled_deduplicated_{args.lang}", cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/oscar_{args.lang}") +print("Done") \ No newline at end of file diff --git a/scripts/exp-005/download_oscar_de.sh b/scripts/exp-005/download_oscar_de.sh new file mode 100644 index 0000000..c102b49 --- /dev/null +++ b/scripts/exp-005/download_oscar_de.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=3-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-005-download_oscar_de + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/005/download_oscar_de.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/005/download_oscar_de.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +python3 $FP_BIGS/scripts/exp-005/download_oscar.py de \ No newline at end of file diff --git a/scripts/exp-005/run_clm.py b/scripts/exp-005/run_clm.py new file mode 100644 index 0000000..396fbfa --- /dev/null +++ b/scripts/exp-005/run_clm.py @@ -0,0 +1,551 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 
+ +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=causal-lm +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. + +import torch +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. 
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
+ raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + ) + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
+ ) + + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + model.resize_token_embeddings(len(tokenizer)) + for name, param in model.named_parameters(): + if name not in ('transformer.wte.weight', 'transformer.wpe.weight'): + print(f"🥶 Freeze layer '{name}'") + param.requires_grad = False + else: + param.requires_grad = True + + # Preprocessing the datasets. + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output + + with training_args.main_process_first(desc="dataset map tokenization"): + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.output_dir}/tokenized_datasets.pt") + saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + print("Sanity check: loaded tokenized_datasets") + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + print("Sanity check: saved tokenized_datasets") + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. 
+ concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.output_dir}/lm_datasets.pt") + saved_lm_datasets_fp.parent.mkdir(parents=True, exist_ok=True) + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + print("Sanity check: loaded lm_datasets") + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + print("Sanity check: saved lm_datasets") + + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + import gc + del tokenized_datasets + gc.collect() + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. 
+ data_collator=default_data_collator, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + print("Checkpoint:", checkpoint) + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub: + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + trainer.push_to_hub(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/scripts/exp-005/run_clm_de.sh b/scripts/exp-005/run_clm_de.sh new file mode 100644 index 0000000..fb2d291 --- /dev/null +++ b/scripts/exp-005/run_clm_de.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=6-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:4 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=16 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-005-run_clm_de + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/005/run_clm_de.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/005/run_clm_de.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +module load gitlfs/2.7.1 +source $FP_BIGS/env_lang_mod/bin/activate + +tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/exp-005/oscar-de-tokenizer" +cache_dir="${FP_BIGS}/data/external/oscar_de" +output_dir="${FP_BIGS}/data/processed/exp-005/ft-gpt2-de" +logging_dir="${FP_BIGS}/reports/exp-005/ft-gpt2-de" + +python $FP_BIGS/scripts/exp-005/run_clm.py \ + --model_name_or_path gpt2 \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name unshuffled_deduplicated_de \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 1000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ No newline at end of file diff --git a/scripts/exp-005/train_tokenizer_gpt2.py b/scripts/exp-005/train_tokenizer_gpt2.py new file mode 100644 index 0000000..b90709a --- /dev/null +++ b/scripts/exp-005/train_tokenizer_gpt2.py @@ -0,0 +1,20 @@ +from datasets import load_dataset + +import os +from pathlib import Path + + +lang = "de" +dataset = load_dataset("oscar", f"unshuffled_deduplicated_{lang}", cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/oscar_{lang}") + +def batch_iterator(): + batch_size = 1000 + for i in range(0, len(dataset), batch_size): + yield dataset['train'][i : i + batch_size]["text"] + +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("gpt2") +assert tokenizer.is_fast +new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50_257) +new_tokenizer.save_pretrained(f"/users/zyong2/data/zyong2/bigscience/data/processed/exp-005/oscar-{lang}-tokenizer") \ No newline at end of file diff --git a/scripts/exp-005/train_tokenizer_gpt2.sh b/scripts/exp-005/train_tokenizer_gpt2.sh new file mode 100644 index 0000000..a273035 --- /dev/null +++ b/scripts/exp-005/train_tokenizer_gpt2.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=3-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
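+# Note: the tokenizer trained by train_tokenizer_gpt2.py above is written to the same
+# directory that run_clm_de.sh passes as --tokenizer_name, so it can be reloaded with:
+#   from transformers import AutoTokenizer
+#   tok = AutoTokenizer.from_pretrained(
+#       "/users/zyong2/data/zyong2/bigscience/data/processed/exp-005/oscar-de-tokenizer")
+#   assert tok.is_fast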
+ +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-005-train_tokenizer_gpt2 + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/005/train_tokenizer_gpt2.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/005/train_tokenizer_gpt2.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +python3 $FP_BIGS/scripts/exp-005/train_tokenizer_gpt2.py \ No newline at end of file diff --git a/scripts/exp-006/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc b/scripts/exp-006/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..928bfcfc22fb936c0683c62ff9f91d578603a56c GIT binary patch literal 5168 zcmd^D%WoXF8K0SbuXZKB?KEw}CP1@`>S3p74?-YUPZc^}z)$W$!z^Y8)&T!Iffd>V+s1w+aKg%<5>^LQJ+8vIHmK`< zEog+zK{IR(TDJAI#p{K|b1OZgyDB2JEX zpqn=yK7N25n9?4lk*EAPq5(|3{`I51&mIZBC#WBB_Aq8V$XobQcReIrKHZ`te+nIL z4Qyr&oNuj>&1~kpZVoD}!D_7j+8$KDwW`*jhOFgxkl1!Cew}iO+kP3z7hL$!*mDYc z_#c&2fuH;w8fCq*-rAXc3d+gsXHI6hj;dt#D;s)MReM{9w+;Mqir<=<^|p0tp|W5{ zd$-@tYcfr^$eXU~N4|31yd9;X8_<0oNYQ{T@+Bo`BuB9bxe&-fUf%%?P?0Z9e8za> zO8+I#+s~;Rrz%O6%sZOa3+|66N`mgHk_V%_?#0o_AA@FFuImL&dY|e4zoP2AbWm zn@-!l=DhsLrHe}0celM|ScuOZ_#u(|@RQ57Wo6bg&~WBtm8_c8vU=9YnprDrXG_d} zvvjhYEi>l>>tqEqY-cN{_Q{p($}@*mSe4b@IKOjFuAW@WuD*pae7l-mD|+xQyEbxQ zHGuUGK;gCpv^uPD(E6h-+F7^kscP1FTRXL0*{pI<6@P+}q1Aye-MKG+g^~Hvh5c%* z0{dNu*>&b**H0b5Nq5#)jqEyWzOhekWH*45&MW8SCiJdlUFhA)Zf3V`gByKlo!riD zkL+w&bGHoGX;BB%(keFv<$$0HewA&EysPH4i3 znb6y(%A05at@ekjWEdyL;h@j>h^B#(M8(DI-8kaI{^oR7+j_RxjpIm5#vXF z+)Pb4(aeNx*x?g|>s3D&s@_Awk3nUm|1=sqo`e z5+3l7N8l`sC!kOK1h_>onxLBsgXG?(hW6(0_j6HQuK#vtm#h0aJP-SXfM&+&BLukW36XQ8D8CwVae>>3eGni&#bl0?LZ0C{%Cz4u~81LLdw2w2wnSo|89z(2IGhwLQ zXrDvK;^c8WQV;3TTp&P4Eho)K(eG6E`*VZZD7U$kmym~3o$UBhJvoQQr`q&Iz%%R` zz^zA6L>zM=&*}nDrl5jCi}N)TdjS-tl0Y3PG{XTu)+jur0Te?3Mr~@U6kr?!@_rC| z2Xanpm-PWHfCO!)Vi z1ep5XgiPomyqs_!(hCqHWB(A)tK&T2FT^5nAuq^DoCZt>XJg2R{(=g>qeM98X$5X{0ns0>w-*R~qpQ;r zQ$;bLO-3l>P_Xul=NU632_T;0S1e)?AyZk3`E$WzipZ&(sLP`E`&eg2_J20bOfX#! z>EdfaOUAnMg&4sTzyxq9SI;Z_OsqSn^e;;v1rIXyKcOSVh1Td)E+QfBh4g6Ecq{l* z0e(HrEJVjSuxL4mO^f9^AkMbY;R6YgtuvI@kpmZE9x#FD$KIz$LB!SCK9vv# zp<$oS+`xdeAaOY}{}s&{#~6ZK3_)8Ez_2ifXbMF+lo*Bd>rkdbMC*P04G)5Nm4wHH z#bZpgUaI)VtAsMhO$qnFD_p%wYJB$l>uDsR%H=P4?{1%vPsmqT*AqM#l6zAQOsbvc z;In)@x%YIY0!)LK&-^*eoRT$OjACiMLG@n<3h-S#mu#IGuVPU!+h%KFG~-3rK3k>t zKe_vKws(IiZ%y~hS1?Doki1-zxbo#GTHTP!gM3+Q%~i(Nq6x79=>=gW$>Yt3@=Q?jrLg6Zc)_drH8$)q1C!f58I>?+Ww7cX)=!E0B&q zj-UpCgFl{@nxX^m#R@cVDUbxd${U4E^9m$*IK<;Izpl?ZuD~mZ{vB}@*WAJeE^{n_ z$7b;XHb21TLu`JC&5yA8F*YAzL!il5&R8SwF6cjR6owtFOoNVjlh=cI?BhLyp-_z4 zxZLyyT_YPa!sSggp*!+a-Y%EGT~E_2fL>c+oe7Jj6gQAT-P}Yj8&Idcpn{>0@(XCJ z`&HYy;ICQH7Y%mXZaX*Z8_pHz!4EDUI(7$I{OiE@CAst>qS&h6-7TK&?d{#&ejTr8 zA{uhMo^f43c^Y89?7B07`nI?Z-v!Dgn$V!dCEBNH zcb4E<=)yle6L4PBLBretHBA7CB8>xqcT%DTO>S@Jo%t{;>i7xoku;eZFLN4her_cC R6(UvN+5kJCx{hX&Y$OHWW4pRXrgDEmf;lN~u7iveXVU$F>3$)bHyqS6P^}c@Z<%@dVCGeg9?S+4| zM#vvHS$q}@u0xYQ0||*oLK+$|9U7i7G(D5z+>EVZ$tzL9%|jx-1&$Tl!?Hmr89H7? 
z@3CTcSk;`GSEuA_5|%>yh=evb4o$BSTDM8Nd~nUZ(URlsAoitfrM=d@JHn3=F7CDD zhzC(G3R+p(`eDpv7NyCh8;N|#MU?bgnJ;&x`zOdq+J-6@$J9n7S(b`mhh?KN^B?$8 z>~F`sO;z)j;9(TtHQ$e6G3y3+c=4gy_{p04DOrb>wb z11=@&jT0?tKtUP|2JLOTU*6VAh!K@R~_>U!>1VC%7u z2)WZXRW;;um%?1FgV780usDSXO_O+ERil2A3XWcpin8`;Rnt+8aUyxwep$-$iR!|;(|!&NC&p69&eM% ziX)jRON-~q%omw_7-c)k9Km@rPW#>}rinrZ7V_n3fQVhT<0oOvg;z}ztuu(m0fHHJY8gn&l)wuGS-cTapwO(y0~3g8Kmd`#^dksCd4XWQlk%2 zR)ICZ24=gw_>%P#O%4y~D`5l5GZa{yD7 z*AO?yR!Jl7DdO=J#FdtadwQ2-wY)MYL#yX*Xg@Zwd;w-|EpY*u5l~EtIDrKHs>($$ zijy#d<#&)-fr`Xq7KKuLg6zf|CTPPqy{}OrHer`IjpPgx%-{MZaTZ6IR0UQ9@jeo) zW8xB!BGi_+(RTwVRkILkS$njs^B7!Pti2J2;v>BFV>z6)}jiIvnZC@JKpB}56WKbZv~J&w_tz@@K{+m_bN+< yK;U|(j`!=-v{hhUoWnOUhQtR*Q0QZQUd;nn=k~dVc}8Nrw2UTT-iB 3.0, 1, 0) + labels = np.where(labels > 3.0, 1, 0) + # print(predictions) + # print(labels) + # assert False + + # apply metric + metric = load_metric(KLUE_TASKS[args.klue_task].metric.split("/")[0]) + if "/" in KLUE_TASKS[args.klue_task].metric: + return metric.compute(predictions=predictions, + references=labels, + average=KLUE_TASKS[args.klue_task].metric.split("/")[1]) + else: + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="steps", + save_strategy="steps", + logging_strategy="steps", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + + +def load_model(args, inference=False): + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=0, + cache_dir=args.cache_dir) + if not inference: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv", + load_as=args.adapter_lang_name) + if args.finetune_strategies == "whole": + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "lang_adapters": + model.train_adapter([args.adapter_lang_name]) + elif args.finetune_strategies == "task_adapters": + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + else: + raise ValueError("Lack configuration") + + print(model) + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + else: + print("🔥 ==================== Inference: ==================== 🔥") + assert args.pretrained_adapters_dir + if args.finetune_strategies == "lang_adapters": + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "task_adapters": + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") + model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + print(model) + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if 
args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh b/scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh new file mode 100644 index 0000000..ae705f4 --- /dev/null +++ b/scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-006-adapters_xnli_ko_gpt2_task_adapters + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/adapters_xnli_ko_gpt2_task_adapters.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/adapters_xnli_ko_gpt2_task_adapters.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="gpt2" + TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-ko" + LANG="ko" + MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/exp-007/madx-gpt2-ko/checkpoint-166500/oscar_ko" + FT_STRATEGIES="task_adapters" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/adapters_xnli_${LANG}_gpt2_lr-${lr}_strategy-${FT_STRATEGIES}" + CACHE_DIR="$FP_BIGS/data/external/hf" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-006/xnli/adapters_xnli_ko.py $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ + --adapter_lang_name "xnli-ko" \ + --finetune_strategies $FT_STRATEGIES +done diff --git a/scripts/exp-006/xnli/xnli_de.py b/scripts/exp-006/xnli/xnli_de.py new file mode 100644 index 0000000..b7b9ace --- /dev/null +++ b/scripts/exp-006/xnli/xnli_de.py @@ -0,0 +1,162 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - 
{message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +print("Arguments: ========") +print(args) + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +if args.zero_shot: + en_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", cache_dir=args.cache_dir) + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer.pad_token = en_tokenizer.eos_token + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) +logger.info(full_train_dataset[100]) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500, + 
num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="steps", + save_strategy="steps", + logging_strategy="steps", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args): + return GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=0, + cache_dir=args.cache_dir) + + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + + model = load_model(args) + training_args.report_to = list() + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-006/xnli/xnli_de_ft_gpt2.sh b/scripts/exp-006/xnli/xnli_de_ft_gpt2.sh new file mode 100644 index 0000000..b2b344d --- /dev/null +++ b/scripts/exp-006/xnli/xnli_de_ft_gpt2.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
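+# Note on the --do_eval_after_train path in xnli_de.py above: checkpoint directories
+# are sorted numerically by their step suffix, e.g.
+#   sorted(["checkpoint-10000", "checkpoint-9500"], key=lambda x: int(x[len("checkpoint-"):]))
+#   -> ["checkpoint-9500", "checkpoint-10000"]
+# so evaluation_dirs[-1] is the latest checkpoint, which is then reloaded for the test run.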
+#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-006-xnli_de_ft_gpt2 + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_ft_gpt2.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_ft_gpt2.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="yongzx/gpt2-finetuned-oscar-de" + TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-de" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_ft_gpt2_${lr}" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-006/xnli/xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train +done diff --git a/scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh b/scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh new file mode 100644 index 0000000..8fe2699 --- /dev/null +++ b/scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-006-xnli_de_ft_gpt2_0shot + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_ft_gpt2_0shot.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_ft_gpt2_0shot.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="gpt2" + TOKENIZER_NAME="gpt2" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_gpt2_0shot_$lr" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-006/xnli/xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --zero_shot +done diff --git a/scripts/exp-006/xnli/xnli_de_gpt2.sh b/scripts/exp-006/xnli/xnli_de_gpt2.sh new file mode 100644 index 0000000..ee7427c --- /dev/null +++ b/scripts/exp-006/xnli/xnli_de_gpt2.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
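+# Together with xnli_de_ft_gpt2.sh and the *_0shot.sh variants, this script completes
+# the 2x2 grid for German XNLI ({pretrained gpt2, OSCAR-de finetuned gpt2} x
+# {supervised, zero-shot}), all driven by the same xnli_de.py entry point.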
+#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-006-xnli_de_gpt2 + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_gpt2.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_gpt2.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="gpt2" + TOKENIZER_NAME="gpt2" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_gpt2_${lr}" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-006/xnli/xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train +done diff --git a/scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh b/scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh new file mode 100644 index 0000000..012da74 --- /dev/null +++ b/scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-006-xnli_de_gpt2_0shot + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_gpt2_0shot.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_gpt2_0shot.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="yongzx/gpt2-finetuned-oscar-de" + TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-de" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_ft_gpt2_0shot_$lr" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-006/xnli/xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --zero_shot +done diff --git a/scripts/exp-006/xnli/xnli_ko.py b/scripts/exp-006/xnli/xnli_ko.py new file mode 100644 index 0000000..37f9c3c --- /dev/null +++ b/scripts/exp-006/xnli/xnli_ko.py @@ -0,0 +1,213 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import GPT2Tokenizer, GPT2ForSequenceClassification + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + 
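+# Despite the xnli_ko.py filename, the Korean data below comes from the KLUE benchmark
+# (ynat / sts / nli); with --zero_shot the classifier is trained and validated on
+# English XNLI and only the test set is the corresponding KLUE split.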
+KLUE = namedtuple("KLUE", ["klue_split", "num_labels", "metric", "model_type"]) +KLUE_TASKS = { + "topic-cls": KLUE(klue_split="ynat", num_labels=7, metric="f1/macro", model_type="seq-cls"), + "sts-pearsonr": KLUE(klue_split="sts", num_labels=1, metric="pearsonr", model_type="seq-cls"), + "sts-binary": KLUE(klue_split="sts", num_labels=1, metric="f1/macro", model_type="seq-cls"), + "nli": KLUE(klue_split="nli", num_labels=3, metric="accuracy", model_type="seq-cls"), +} + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--klue_task", choices=KLUE_TASKS.keys(), default="nli") +parser.add_argument("--lang", type=str, default="ko") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +print("Arguments: ========") +print(args) + +# load dataset +klue_dataset = load_dataset("klue", KLUE_TASKS[args.klue_task].klue_split, cache_dir=args.cache_dir) +if args.zero_shot: + print("0️⃣ 0-Shot") + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + + if "test" not in klue_dataset: + _train_dataset = klue_dataset['train'].train_test_split(train_size=0.8, shuffle=True, seed=42) + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = klue_dataset['validation'] + else: + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = klue_dataset['test'] +else: + print("👀 Supervised Training") + if "test" not in klue_dataset: + _train_dataset = klue_dataset['train'].train_test_split(train_size=0.8, shuffle=True, seed=42) + train_dataset = _train_dataset['train'] + val_dataset = _train_dataset['test'] + test_dataset = klue_dataset['validation'] + else: + train_dataset = klue_dataset['train'] + val_dataset = klue_dataset['validation'] + test_dataset = klue_dataset['test'] + + +# load tokenizer +tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +if args.zero_shot: + en_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", cache_dir=args.cache_dir) + +def tokenize_function(examples): + if KLUE_TASKS[args.klue_task].klue_split == "ynat": + return tokenizer(examples["title"], max_length=128, padding="max_length", truncation=True) + elif KLUE_TASKS[args.klue_task].klue_split == "sts": + return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', max_length=128, padding="max_length", truncation=True) + elif KLUE_TASKS[args.klue_task].klue_split == "nli": + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + if KLUE_TASKS[args.klue_task].klue_split == "ynat": + return 
en_tokenizer(examples["title"], max_length=128, padding="max_length", truncation=True) + elif KLUE_TASKS[args.klue_task].klue_split == "sts": + return en_tokenizer(f'{examples["sentence1"]} {en_tokenizer.eos_token} {examples["sentence2"]}', max_length=128, padding="max_length", truncation=True) + elif KLUE_TASKS[args.klue_task].klue_split == "nli": + return en_tokenizer(f'{examples["premise"]} {en_tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +def postprocessing(example): + if KLUE_TASKS[args.klue_task].klue_split == "sts": + example['labels'] = example['labels']['real-label'] + return example + else: + return example + +logger.info("Tokenizing the dataset...") +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer.pad_token = en_tokenizer.eos_token + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False).map(postprocessing) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False).map(postprocessing) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False).map(postprocessing) + full_val_dataset = val_dataset.map(tokenize_function, batched=False).map(postprocessing) +full_test_dataset = test_dataset.map(tokenize_function, batched=False).map(postprocessing) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) +logger.info(full_train_dataset[100]) + +def compute_metrics(eval_pred): + logits, labels = eval_pred + + if "pearsonr" in KLUE_TASKS[args.klue_task].metric: + predictions = logits.flatten() + else: + predictions = np.argmax(logits, axis=-1) + + ### only for STS-binary + if args.klue_task == "sts-binary": + predictions = np.where(logits.flatten() > 3.0, 1, 0) + labels = np.where(labels > 3.0, 1, 0) + # print(predictions) + # print(labels) + # assert False + + # apply metric + metric = load_metric(KLUE_TASKS[args.klue_task].metric.split("/")[0]) + if "/" in KLUE_TASKS[args.klue_task].metric: + return metric.compute(predictions=predictions, + references=labels, + average=KLUE_TASKS[args.klue_task].metric.split("/")[1]) + else: + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + + +def load_model(args): + if KLUE_TASKS[args.klue_task].model_type == "seq-cls": + return GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=KLUE_TASKS[args.klue_task].num_labels, + pad_token_id=0, + cache_dir=args.cache_dir) + + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + 
compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args) + training_args.report_to = list() + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh b/scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh new file mode 100644 index 0000000..648a4c5 --- /dev/null +++ b/scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-006-xnli_ko_ft_gpt2 + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_ko_ft_gpt2.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_ko_ft_gpt2.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="yongzx/gpt2-finetuned-oscar-ko" + TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-ko" + LANG="ko" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_ft_gpt2_${lr}" + CACHE_DIR="$FP_BIGS/data/external/hf" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-006/xnli/xnli_ko.py $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train +done diff --git a/scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh b/scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh new file mode 100644 index 0000000..8e59776 --- /dev/null +++ b/scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
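+# Zero-shot variant of xnli_ko_ft_gpt2.sh above: --zero_shot makes xnli_ko.py train on
+# English XNLI (tokenized with the stock gpt2 tokenizer) and evaluate on the Korean
+# KLUE NLI split, since --klue_task defaults to "nli".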
+#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-006-xnli_ko_ft_gpt2_0shot + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_ko_ft_gpt2_0shot.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_ko_ft_gpt2_0shot.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="yongzx/gpt2-finetuned-oscar-ko" + TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-ko" + LANG="ko" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_ft_gpt2_0shot_${lr}" + CACHE_DIR="$FP_BIGS/data/external/hf" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-006/xnli/xnli_ko.py $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --zero_shot +done diff --git a/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8 b/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8 new file mode 100644 index 0000000..ddeddb4 --- /dev/null +++ b/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8 @@ -0,0 +1,124 @@ +{ + "lingaccept": { + "cola": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_lingaccept_cola_pfeiffer.json" + } + }, + "default": "adapters/ukp/gpt2_lingaccept_cola_pfeiffer.json" + } + }, + "lm": { + "poem": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_lm_poem_pfeiffer.json" + } + }, + "default": "adapters/ukp/gpt2_lm_poem_pfeiffer.json" + } + }, + "nli": { + "multinli": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_nli_multinli_pfeiffer.json" + } + }, + "b1017368d7a97b11": { + "versions": { + "ukp": "adapters/ukp/gpt2_nli_multinli_houlsby.json" + } + }, + "default": "adapters/ukp/gpt2_nli_multinli_pfeiffer.json" + }, + "qnli": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_nli_qnli_pfeiffer.json" + } + }, + "b1017368d7a97b11": { + "versions": { + "ukp": "adapters/ukp/gpt2_nli_qnli_houlsby.json" + } + }, + "default": "adapters/ukp/gpt2_nli_qnli_pfeiffer.json" + }, + "rte": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_nli_rte_pfeiffer.json" + } + }, + "b1017368d7a97b11": { + "versions": { + "ukp": "adapters/ukp/gpt2_nli_rte_houlsby.json" + } + }, + "default": "adapters/ukp/gpt2_nli_rte_pfeiffer.json" + } + }, + "sentiment": { + "sst-2": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_sentiment_sst-2_pfeiffer.json" + } + }, + "default": "adapters/ukp/gpt2_sentiment_sst-2_pfeiffer.json" + } + }, + "sentiment ": { + "sst-2": { + "b1017368d7a97b11": { + "versions": { + "ukp": "adapters/ukp/gpt2_sentiment_sst-2_houlsby.json" + } + }, + "default": 
"adapters/ukp/gpt2_sentiment_sst-2_houlsby.json" + } + }, + "sts": { + "mrpc": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_sts_mrpc_pfeiffer.json" + } + }, + "b1017368d7a97b11": { + "versions": { + "ukp": "adapters/ukp/gpt2_sts_mrpc_houlsby.json" + } + }, + "default": "adapters/ukp/gpt2_sts_mrpc_pfeiffer.json" + }, + "qqp": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_sts_qqp_pfeiffer.json" + } + }, + "b1017368d7a97b11": { + "versions": { + "ukp": "adapters/ukp/gpt2_sts_qqp_houlsby.json" + } + }, + "default": "adapters/ukp/gpt2_sts_qqp_pfeiffer.json" + }, + "sts-b": { + "9076f36a74755ac4": { + "versions": { + "ukp": "adapters/ukp/gpt2_sts_sts-b_pfeiffer.json" + } + }, + "b1017368d7a97b11": { + "versions": { + "ukp": "adapters/ukp/gpt2_sts_sts-b_houlsby.json" + } + }, + "default": "adapters/ukp/gpt2_sts_sts-b_pfeiffer.json" + } + } +} \ No newline at end of file diff --git a/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.json b/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.json new file mode 100644 index 0000000..857f977 --- /dev/null +++ b/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.json @@ -0,0 +1 @@ +{"url": "https://raw.githubusercontent.com/Adapter-Hub/Hub/master/dist/v2/index/gpt2.json", "etag": "W/\"b52e703eb3a35aeb1ba6beaec6300d7d265a1a585fe67273fa7c6e4622a63cd4\""} \ No newline at end of file diff --git a/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.lock b/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.lock new file mode 100755 index 0000000..e69de29 diff --git a/scripts/exp-006/xquad/GPT2ForQuestionAnswering.py b/scripts/exp-006/xquad/GPT2ForQuestionAnswering.py new file mode 100644 index 0000000..bb91cd6 --- /dev/null +++ b/scripts/exp-006/xquad/GPT2ForQuestionAnswering.py @@ -0,0 +1,129 @@ +import torch +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import functional as F + +from transformers import GPT2PreTrainedModel, GPT2Model + + +# warnings can be ignored: +# https://discuss.huggingface.co/t/gpt2lmheadmodel-from-pretrained-gpt2-not-loading-attn-weights/432 + +class GPT2ForQuestionAnswering(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = GPT2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + self.is_parallelizable = False + + self.init_weights() + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=False, + output_attentions=False, + return_dict=False, + start_positions=None, + end_positions=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span 
for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
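+        Note: the span head (``qa_outputs``) is a single linear layer that produces two
+            scores per token, assuming ``config.num_labels == 2``; these are split into
+            the start and end logits described above.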
+ + Examples:: + + from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple + import torch + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss = outputs[0] + + """ + + outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + return_dict=return_dict + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) diff --git a/scripts/exp-006/xquad/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc b/scripts/exp-006/xquad/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f15d2f948c6dc94e3d74735b459c40c14f0a13c4 GIT binary patch literal 5202 zcmd^DNpBpv6|Sn@Tiuc^uaikKoC%QWMzq)yU=BhQ%-FDI4zl3HIt1+))$Ag>yR6zw zvSdjq ze)qRO)4O*q>+d+2JZ0#71^;p%8fGy&uzL9G1XgJGY#aNfzzIveQdsVl^|%b zm7o^Zd-brGJZ{PYoaU`lhCMxOHHhz2nA#y3y)zjz|}zMy`@+2fe;AaCGH-Sv?0SUpSfNI;xb}uWjg+Rpnh3-q!HT8GfrXeBa2dcg-^k zRR;Sydpq5wydu+ti@ff-e&j3H&6`mgx&a;VK#CgdlrJekBRPyk$b~>g^6D-qf{J`; z>@&tASNbPBZ@#2%oT?;MGQaLiH=%+C0T28WItX~)(&S%pe>7GSv|5%t80J+kj)wjS zblz}XFQ8Jo?q95b>~D^eq1s^-gxumIF3eNY!3X9^n4z9d{cVDPn6u`N6GrV zdtYuIy-X?Fyh@WB$+1BEp@C!z8q03kb*E{s+EwS|(<_%v)UvnZE#oS90KA&aL->`; zwq<433$Q`vWTmW}RkCVU%j#JpYi3K#e!FzKoGmlwBkObpv~OoCXZGo}?Ai;5l~|co z-a5Z^POqP?X4l`r7`|Q4R*N3I%T|XDtOhv$0Vv$DfL4pu4jO;7MKfzpda9hY-c`=5 z*ETC1l*ON5WN5YEOMC9iUtwgvv|+yrE5Uv@V0M)`*^M&?kkp>_RV};0>Tm7So7qj^ zr1jc4y#>A1tPQ=}*{$sM9dNXdtAGe{zk4}R z0xl+f#xGF7y`oqVMUhEAg8iuH#c8B2kz|o+9Px1QtaqF>*?;Lbuz2~P5A4A?aynhew@ zz9fl=4*~M*oO|!biU!74`4O{zh|w(6_p9onWdc2DHftg&Ydjp7A_mh9m*RQ~Zj>EFxqo zOEG^gcuWyFbrW?})cz3b%*g)NrkM$*>mglyEojMDcfJrKcsLjXF6G)qg`bOc7nJ_> z#7Dt{%sGJ2q2fYobSf8-5cfiQv}U{&{HXxHo@N%J;~a89M`#KC9#0eNQyL3WXm$(x zKRSYHEZ}l$YlerRh(j`;@`xWMu~0;3zI`nE;Cv(2%=ofn+EZ8%@ad+-avcz7+vxCt zgvjHUuy% z%psaWQ4S?WA^irFsSwdRpMA@NAYLQkF=6ot6Rnpj{@EI#402P#J@5)wuaX*{-R?#j zNvLx93GdwN67m!B0PA{!XGU^=%7IC>(;R%Bk7xIv&s2bE@amaAhnZ8d#*0xbtv9It 
z3qb+Ci|3NUX&uk!~&7!t`EnOXlWZr-T=Xwx4d49c!=F?qdzGULQ z%Y07>IJa8wb@NYn-~e7?e)t}b_jpOt@<$QWAaEyur=_~U6SG)>2JQ=zz*l*#uxVa` z1P_OKyaA}{vyLn9lA?Q8tm2wG*x-p-e1y#pu=yC9A7b+(Y<`T*C)f~dK7}S@z@mQvh826b~AxvW8*_KFIILdvh8u^yIf=aRpA$y_xw?WS|nzUf@En>PHz^+U^U zL5qJa7@v?UFCvPqy6wH<+1c6I+v`^G<|d*c$D1411(c@&_FJwC_aTA#qHjLLm%v}% zF0NTTaAVh<3DmX4P53TQE>VXDEiTbWb-UGqE2B&Q^h~@&(m}&q4%JNni6V^yfwxwo r0!?o3X$fuBHWW4pRXrh8TKb_@N~u7iveXVz6FjI+rzR!C>c6l zMeng&+WKM4WfrB$rR#}&z(thwTbVC+rTZtyNZN)f7su2_Bw3bH zQS5KWyiHZ}rr==|;5FZmVKM6lczE%l+W5&EKsfOzsBVB_p!_rAA?bnAp>^bXW?17^ zXg{N2IdqP|2w}aLyI~dP_CsP3uMEmH)aF9&rPzg-T@bm>@B1<1d;V|~bE!^_Qkk)l zNCPe<>x~mFX+S|516m2132gZKJM_In;OJ(mqz#i;XVN834v0usGUv>JUp6rBv=6B?_ zAMA7v_S2+)rL!INWf1Ws;2jx=Xq3qg-y2=JeEHMP-o3FOcCxuY**$-2v@a^~_S?`T zN^4L5ku`Dd08nxp&;AHE3>?@~5r4r5oG~9w9dHiKok3-0PAVXYnO8s( zckgQE=I*pQBZu?``RU@MnpdZ_+$x$b@&&Bc^XiP|?wp%fpOWY1RdVme9e8Wn z$lVzPKw?E?;=+s8VC6BDjvh51Q?Ze)PS>&%)Abp_eZW6C-N037r=V@Zc^EV3A#6%r z&;1f?J@yeNx7((whJ5Z=n5%U#dVwJprx2%U67Q>O)K5~u(JN9>);_IjI<_%dB+tAP zeV(x;Cr_&dA_ zu?m>f=!3*nU=1)iFb1Z0A0tv@t@JOfU3&x8sK%Pcquhb>bG*m7a|_?H3%r#>Yi4Q0 z(Iw&>z*OZG#Lcl)(ujM4czg+Qr6uB?+#y*luMEo2>bV=*k4!9LfSH?1TmWVSlvE;4 zAVI&XauJN;B+Ov>Eo4@pF7cQ}p%fn>yD^6e+VFMnYgC9$*d{1w4+SFfQe9Q&GoFh#Y~n1+A;Al|$D{&9>&X zS5%p?Fb(i2C~%4FrDDj16j-VS)@gO-W&YNp3XZcVmfAbs=KOEUUhHoLkUh6xfC}(f zSvdD9ONKz;dZ&)}>(sPWU|yWZH!+69dq_~|V|`xE16SwvxrTX0V!gDCCScx%Y_Mkj E1Bt}9KL7v# literal 0 HcmV?d00001 diff --git a/scripts/exp-006/xquad/__pycache__/utils_qa.cpython-37.pyc b/scripts/exp-006/xquad/__pycache__/utils_qa.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85335495cb3f1c0a2d45627e5c52893967850bb8 GIT binary patch literal 13232 zcmeHOO>7)TcJ9CV;gB2-$q|1gszv>AByvbe{@GPzt-Wi@@Mc%GWyNy1s~t93J!Dfe z-NWi0iDYWBiKX3~ykU}KWPn5h1j!-DA=v~8VEAO@l!F5-f<{gWuz-Rjmn6=@hfTg$ z-SbC@TFLPuSil_ho9g;`_3G8DS5@yDeYL+oqu|f_@O|rF-d2=hjxIH1Mrg64m5lvYJ{jgXk1}E_tPxHdhaC` z55-*iSYL}z`jLEySCdD?k2v+;?_pUs08S%JdrK+gwl8Gf@cdw|#=RBW_Z@fDtOriT z2^`xu%N{p3>b4&^o@-jJf5+yiP|Q4lf|RQTElj^&qcW=!Ca%|O6}zB@MzF!ES03b1 z}-ZT6>k+y zZ`+j>Irk7{0T0dg*YNm1N+?R82<qT5Qk7hy?L)OCNg3S zrQ@PkeoqLMzF8J|lTmuNj}5T?J*{zAjFYCs1kq8YaYT%;{#)rC)vqCyL&=d_#*WIr z&j!Vj&bn;qzR?&7`ggNpd{1S=!su04eoSG*-vg)J0gQf1mQEZ{dUW$gVhXzayN0>> zkvNLf2aRLGgbpS#V+ly`u{hTCHYrS64^q01#iTeEziE7EN8LbeVcvoUsGTq&Nq$rj z2}rN;^RoOf+VnmD}M7vzKq`#H3y?P&frSw4yxjQSvrMEtQ1%bFOcp2Y<92%a5p zqy;%s=>G=#-uPQdM{X!WM^Aqxi025^h_u<54hF?28)2jO6gI}j-_u#YFun(0($Hj! zPl6#f!6xs)g312ws*vszp|!Z=*nU=d+L~iywd`o-LXAoQfbU=n!&i3!CEDGhhV)0$XtMAC#7V6E;Xh5 zKW@t)aw1PF@RH(gGrf`BG=(N$yDj+)o)KA=e_m;1F}8mN2U*@YBc{a}c4}N1Q}BJ7 zz8hzQ332wWDu$Xo#aWSrO&I(QcC@3X99m%f?Ho4K9;X;=w%+(p^p=u#l;9r+lk10f zkBHQsx?}je!IVg06&K!@zDSlG6)BOH7HjYyvNLX4q*3GS`x;i>>_avFfz8G>=46dI zSVV&N?W>pI3M_>^c5J`px*P3~a{HA6 z#NBvqWy|DtQ0K1c2NtBUTxQyMu*EsxosoUM#8>@`GDQQoH?1mmX8$5OwoTV_vDMpg zcQ)3f6+ni-obhd2id3~6a9`n0ErarAD*3)GIP;e zUca~`PZr;DYekn?oLgJ7W;=azsn8xO4VMPl$s?MJXdW+`!5z#T@$lt@V5oH9p_z#E z8R=1~l``9^h1&g^jd{T_VykGr7WgeKP_Jc&`+>=vs_oK|#;n%y3Vp4tN#<_7x@>c@ z3%g>YP9SY08eLO~OKrU#a4<4seJp!kWog!2^5U}>(#$Sf^$OU+G9$KEtcq{%SI&(! 
z8^eMi9EKom*{r+F=9Mi{L(>$x#2xlM-V;|9TDrp@)*bm0(ET1V_?>6AY`h-xVl53j^#L>|@9F!gU(aK3TR%l9qY+@Z;sO z&21MKVwiPs$A-U&&6`Ozcl8&a1ZJaCfOQ*Yh#hpPt*Z_NVz{`HelE(RZ6xvw*0OgS zwnAqR*}GKPQqhP@qkV{#tkArc94_-%CqqfCJ5H@y1KnuD*8l z)usJgOsmGy5;4cJ$Bt>v*u~W%wyZ}qm!dq-a1-DDCJEh)4=kCu($_2GJYn1cMHixd zi1zT-Haa8s?NYRF&(ek+rwc`{5>Mk*c+u^_E~6p~+Rn6=TXBVI(w~mU!1ER!e+0zA zZTe;w_Y4&`DntZ+PunowQvO)sITRN3P%AEksuQZWWQHW4zC6ELEBo`bo6h^oSmg8D zTb{dmety|m^;aA+tNG}B<K2=)z>8nnu!^;tA;D7MNs8t+EvLcngTUcH`}`B6m<&Imnv3unOT?r z5O4p1N7W2m7Dl_6q-=#loef`|>` zdk|CSlc>?psv%S<1xzmU5%d!5)?9YHG~V)$Sp+En+!7! z!l-0S9zUOWa zb~z)NHR8%Z?n6~aO9vEW=@3)$81svg!9ZzW#>iWiJdJg*q`fUvi=lew0r_r>G3nq^ z;|8Ic*>2Wv_b2a2+xeJJi#>5EoCtLs^25yAR;6yg!nw!EOVz9L-Vye$cojMVW7qY= z-uTKzUOF^L*{B<|7OdCgd$P)Zs2d9bP!p{dzUjx1Aa%^9?o6 z4~?3IXnDf%DH@)8#l7ut&!uBSG%{n|bKKDIAaj_w!(kVpK}ru3tX{48g@rKPj2(pO z=FybXD&gdhIU*|xr-(YIP&)4;(oZA{vY5kxw$$P7{lzThw0yoF4!5_ZQs9+n&Njn* zv!hbY=B2m>9|A`#nkd{54iKXb0|+JOXolqga?v3?0taMHf2%LqODcX12V$PoyMyE2 z;xLZ$kCdg@$HQ;M;NG?c3f!c#y%*aMk zTj4-^*rl>lvBONHSD9|F&ZsOYt%mP&AmPNp{^=-$^gjYo=K9pMn%45_v_7I{)FE|3 z$Gd8%X+5Kw`h=F&4As!H8glhMJ*Vf?F-=o7HL3O)Q|c7jWQ?pTKboN(MqNCrp^YFf z2@XjugO(Hegqi_o)K;gAqB>&a)x6rL=hdv9R7Z3JZ&}pKt2xxjYB?jR<_)BfPO69F z2Pu6>?Ng^AC(&f|t?^BaOHm7~))H|OMog#cGLZ=4zp;JSRVMMKSVz>jH8ltH0#=9+aDjy?Khk#Cf z7^Pi}^#au)geoKB$b?W4E`==^aov8TfJq<)%#6x+w9P;5La4Nx_pt?!0)qn1K^PSH zO#q>q1VY8dxJ0PdfI9UG!ZD5jKcnv{;92y2RAlA1DF)>CF`VF$FOjS1C@qnzUc}Fj z1GDOtSkSKn@0lRvhrcV1H73xL7W9jxMCv|i3~l~9-cs`IpBg8Ete6tORe|lD1S*iq z10@0Obqe{W0Cs@<)8Zt?IUS==_r>Y1w`p+_5=A}V2jT#nLB466@E_C`CnX9cQ$V3| zqfHdbl;y`!n^341#c`le1L7o5s6mNBy(r5kf%Qav5DFFX2Mz}m>Ilg#j*^t%+40ta zLfx16(+?yyG1ECQ$%a`TXwxA!0<@{Og*J^bnxB~#l2j0d-9a`6{N@bsnllnfIztGc zM3M?@9C(ek@xyK;i6?_*99qbWu?Qmq_Mr3c17n&1$}`f;1!~pm zTO4ZUZu}HH(o+8h|4A^hKDm2Xq=DY({zt(PiOo#M=na)kiL~gEbF1@zWXFNI^uY2@ zNX%tkw(hL)Q+6`0aZ1)$fMfwf+J%h%eGZT9M9qumj8J{IvtxI-LDTua~=caAf^?aM-Wu< z%{w+lTO+_>r9$Ud+`c<+>tEnLU*JAp;64=21m?Ey*FsMg*ZhUVeWL4uJWxK_D4*RI zoxd*emL@cbBWgDa8hD<0%eHF~WDxmH>Cz;~V#g3c2L!m(?Udw%fY6=hbCnVdDYgv6 z8@}-Kk+B`EZ>sVOWFtoN{4bCVX~`|{V(Dp+4Zejt-S&m0_PyFa5ZNf>*99$TetDgmn zXh%~xg%Yk53>oK^ap*AHioqaK<1L4MaTG?rwL-m53M_DwB;6MqJw|!pz^9See&Mqz zqYX3g{GpC_eL5SE`Ka-IkUVq_q-xj=>~<5?p6EP{gbI~^+e`e(F> zCOpy=t>@2y`_G!7cJIF!v3mYIsqsrhzD&bY`9(_q63A1A>yPtG)ba%)mx+9Z$csc? zBJx!tUnBA|kynUZAwt1-{wheA;Fh~;hdPi>{&mXzWg=&Ze3OW@i>s7+oyZ$R-XwC3 z2!-1DTSTrC`4uAgUj+)kLF64Gze;40$hSZW7x=d+eUr$eLh0L7@M}c6)b_7m|BlQ+}xvb>G)a^Y+j?h4I-R~Pb45x zCvuy}9U{3F$s-k>eTrrGUufX}35YUFLFnv5LFnAWA!rI#$ARcPLd`h@i2LaCxDa%L zLfTQ_IolO@&Le>Ql!4~og0zLvFjXosZ>3axT&MB` z@T=NZgfjKe|IH{`bwGHu1i~KRPv3~~sZxo48Uog|R>D;L`;IUb|L8I_=x2xgyTn(T zy<9C6|K=oH$9%|{k&}|XgkP`LEB0kVH~j?=g!ki54uSnXJ(Hr4e)DtjyULr*boBkd D(iz@< literal 0 HcmV?d00001 diff --git a/scripts/exp-006/xquad/eval_germanquad.sh b/scripts/exp-006/xquad/eval_germanquad.sh new file mode 100644 index 0000000..d18edbe --- /dev/null +++ b/scripts/exp-006/xquad/eval_germanquad.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
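+# Fine-tunes and evaluates extractive QA on deepset/germanquad through eval_qa.py
+# (added below), which imports the GPT2ForQuestionAnswering head defined earlier in
+# this patch.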
+#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-006-eval_germanquad + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/eval_germanquad.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/eval_germanquad.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="gpt2" + TOKENIZER_NAME="gpt2" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/germanquad/$lr" + CACHE_DIR="$FP_BIGS/data/external/xquad" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-006/xquad/eval_qa.py \ + --output_dir $OUTPUT_DIR \ + --dataset_name "deepset/germanquad" \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 50 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --model_name_or_path $MODEL_NAME \ + --tokenizer_name $TOKENIZER_NAME \ + --do_train \ + --do_predict +done diff --git a/scripts/exp-006/xquad/eval_qa.py b/scripts/exp-006/xquad/eval_qa.py new file mode 100644 index 0000000..402ff31 --- /dev/null +++ b/scripts/exp-006/xquad/eval_qa.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer. +""" +# You can also adapt this script on your own question answering task. Pointers for this are left as comments. + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import datasets +from datasets import load_dataset, load_metric + +import transformers +from trainer_qa import QuestionAnsweringTrainer +from transformers import ( + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + PreTrainedTokenizerFast, + TrainingArguments, + default_data_collator, + set_seed, +) + +from GPT2ForQuestionAnswering import GPT2ForQuestionAnswering +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version +from utils_qa import postprocess_qa_predictions + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +# check_min_version("4.13.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") + +logger = logging.getLogger(__name__) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
+ """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + test_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_seq_length: int = field( + default=384, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " + "be faster on GPU but will be slower on TPU)." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." 
+ }, + ) + version_2_with_negative: bool = field( + default=False, metadata={"help": "If true, some of the examples do not have an answer."} + ) + null_score_diff_threshold: float = field( + default=0.0, + metadata={ + "help": "The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. " + "Only useful when `version_2_with_negative=True`." + }, + ) + doc_stride: int = field( + default=128, + metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, + ) + n_best_size: int = field( + default=20, + metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, + ) + max_answer_length: int = field( + default=30, + metadata={ + "help": "The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another." + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training/validation file/test_file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. 
+ last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
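+    # Note: GPT-2 ships without a dedicated padding token, so the EOS token is reused as the
+    # pad token further down.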
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer.pad_token = tokenizer.eos_token + + # model = AutoModelForQuestionAnswering.from_pretrained( + # model_args.model_name_or_path, + # from_tf=bool(".ckpt" in model_args.model_name_or_path), + # config=config, + # cache_dir=model_args.cache_dir, + # revision=model_args.model_revision, + # use_auth_token=True if model_args.use_auth_token else None, + # ) + + model = GPT2ForQuestionAnswering.from_pretrained(model_args.model_name_or_path, num_labels=2, pad_token_id=0) + + # Tokenizer check: this script requires a fast tokenizer. + if not isinstance(tokenizer, PreTrainedTokenizerFast): + raise ValueError( + "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " + "requirement" + ) + + # Preprocessing the datasets. + # Preprocessing is slighlty different for training and evaluation. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + elif training_args.do_eval: + column_names = raw_datasets["validation"].column_names + else: + column_names = raw_datasets["test"].column_names + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question). + pad_on_right = tokenizer.padding_side == "right" + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + # Training preprocessing + def prepare_train_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
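+        # Illustration (using the default max_seq_length=384 and doc_stride=128): consecutive
+        # features share 128 context tokens, so an answer of up to 128 tokens that is cut off at
+        # one window boundary still appears intact in the following window.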
+ tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + # cls_index = input_ids.index(tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + logger.debug("Yes") + assert False + # tokenized_examples["start_positions"].append(cls_index) + # tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). 
+ while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if data_args.max_train_samples is not None: + # We will select sample from whole data if argument is specified + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + # Create train feature from dataset + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", + ) + if data_args.max_train_samples is not None: + # Number of samples might increase during Feature Creation, We select only specified max samples + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + # Validation preprocessing + def prepare_validation_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
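+            # (postprocess_qa_predictions later uses these None entries to skip candidate spans
+            # that fall inside the question rather than the context.)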
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + if training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = raw_datasets["validation"] + if data_args.max_eval_samples is not None: + # We will select sample from whole data + eval_examples = eval_examples.select(range(data_args.max_eval_samples)) + # Validation Feature Creation + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) + if data_args.max_eval_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + if training_args.do_predict: + if "test" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") + predict_examples = raw_datasets["test"] + if data_args.max_predict_samples is not None: + # We will select sample from whole data + predict_examples = predict_examples.select(range(data_args.max_predict_samples)) + # Predict Feature Creation + with training_args.main_process_first(desc="prediction dataset map pre-processing"): + predict_dataset = predict_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", + ) + if data_args.max_predict_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) + + # Data collator + # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data + # collator. + data_collator = ( + default_data_collator + if data_args.pad_to_max_length + else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) + ) + + # Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=data_args.version_2_with_negative, + n_best_size=data_args.n_best_size, + max_answer_length=data_args.max_answer_length, + null_score_diff_threshold=data_args.null_score_diff_threshold, + output_dir=training_args.output_dir, + log_level=log_level, + prefix=stage, + ) + # Format the result to the format the metric expects. 
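+        # (The squad_v2 metric expects a `no_answer_probability` per prediction; a constant 0.0 is
+        # passed because null answers were already selected by thresholding during post-processing.)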
+ if data_args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") + + def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + # Initialize our Trainer + trainer = QuestionAnsweringTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + eval_examples=eval_examples if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + post_process_function=post_processing_function, + compute_metrics=compute_metrics, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Prediction + if training_args.do_predict: + logger.info("*** Predict ***") + results = trainer.predict(predict_dataset, predict_examples) + metrics = results.metrics + + max_predict_samples = ( + data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) + ) + metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) + + trainer.log_metrics("predict", metrics) + trainer.save_metrics("predict", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/exp-006/xquad/trainer_qa.py b/scripts/exp-006/xquad/trainer_qa.py new file mode 100644 index 0000000..3e005e9 --- /dev/null +++ b/scripts/exp-006/xquad/trainer_qa.py @@ -0,0 +1,105 @@ +# coding=utf-8 +# Copyright 2020 The 
HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A subclass of `Trainer` specific to Question-Answering tasks +""" + +from transformers import Trainer, is_torch_tpu_available +from transformers.trainer_utils import PredictionOutput + + +if is_torch_tpu_available(): + import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met + + +class QuestionAnsweringTrainer(Trainer): + def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): + super().__init__(*args, **kwargs) + self.eval_examples = eval_examples + self.post_process_function = post_process_function + + def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): + eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset + eval_dataloader = self.get_eval_dataloader(eval_dataset) + eval_examples = self.eval_examples if eval_examples is None else eval_examples + + # Temporarily disable metric computation, we will do it in the loop here. + compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is not None and self.compute_metrics is not None: + eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) + metrics = self.compute_metrics(eval_preds) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + self.log(metrics) + else: + metrics = {} + + if self.args.tpu_metrics_debug or self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) + return metrics + + def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): + predict_dataloader = self.get_test_dataloader(predict_dataset) + + # Temporarily disable metric computation, we will do it in the loop here. 
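+        # (The generic evaluation loop cannot score raw start/end logits; metrics are computed only
+        # after the post-processing function has mapped the logits back to text spans.)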
+ compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + predict_dataloader, + description="Prediction", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is None or self.compute_metrics is None: + return output + + predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") + metrics = self.compute_metrics(predictions) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) \ No newline at end of file diff --git a/scripts/exp-006/xquad/utils_qa.py b/scripts/exp-006/xquad/utils_qa.py new file mode 100644 index 0000000..dedbd85 --- /dev/null +++ b/scripts/exp-006/xquad/utils_qa.py @@ -0,0 +1,431 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Post-processing utilities for question answering. +""" +import collections +import json +import logging +import os +from typing import Optional, Tuple + +import numpy as np +from tqdm.auto import tqdm + + +logger = logging.getLogger(__name__) + + +def postprocess_qa_predictions( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts. This is the base postprocessing functions for models that only return start and end logits. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. 
+ max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. 
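+            # (The null answer corresponds to predicting index 0 for both start and end; its score
+            # is compared against the best non-null span further below.)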
+ feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): + predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction. If the null answer is not possible, this is easy. + if not version_2_with_negative: + all_predictions[example["id"]] = predictions[0]["text"] + else: + # Otherwise we first need to find the best non-empty prediction. + i = 0 + while predictions[i]["text"] == "": + i += 1 + best_non_null_pred = predictions[i] + + # Then we compare to the null prediction using the threshold. 
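+            # (With the default null_score_diff_threshold of 0.0 this reduces to returning the
+            # empty answer whenever the null score exceeds the best non-null span's score.)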
+ score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] + scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. + if score_diff > null_score_diff_threshold: + all_predictions[example["id"]] = "" + else: + all_predictions[example["id"]] = best_non_null_pred["text"] + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions + + +def postprocess_qa_predictions_with_beam_search( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + start_n_top: int = 5, + end_n_top: int = 5, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the + original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as + cls token predictions. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + start_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. 
+ end_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 5: + raise ValueError("`predictions` should be a tuple with five elements.") + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() if version_2_with_negative else None + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_score = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_log_prob = start_top_log_probs[feature_index] + start_indexes = start_top_index[feature_index] + end_log_prob = end_top_log_probs[feature_index] + end_indexes = end_top_index[feature_index] + feature_null_score = cls_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction + if min_null_score is None or feature_null_score < min_null_score: + min_null_score = feature_null_score + + # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. 
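+            # (end_top_log_probs/end_top_index are flattened per start candidate, hence the
+            # j_index = i * end_n_top + j lookup below.)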
+ for i in range(start_n_top): + for j in range(end_n_top): + start_index = int(start_indexes[i]) + j_index = i * end_n_top + j + end_index = int(end_indexes[j_index]) + # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the + # p_mask but let's not take any risk) + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + ): + continue + # Don't consider answers with a length negative or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_log_prob[i] + end_log_prob[j_index], + "start_log_prob": start_log_prob[i], + "end_log_prob": end_log_prob[j_index], + } + ) + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0: + predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction and set the probability for the null answer. + all_predictions[example["id"]] = predictions[0]["text"] + if version_2_with_negative: + scores_diff_json[example["id"]] = float(min_null_score) + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. 
+ if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, scores_diff_json \ No newline at end of file diff --git a/scripts/exp-007/madx_run_clm.py b/scripts/exp-007/madx_run_clm.py new file mode 100644 index 0000000..581b07c --- /dev/null +++ b/scripts/exp-007/madx_run_clm.py @@ -0,0 +1,593 @@ +""" +Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import torch +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +import transformers.adapters.composition as ac +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AdapterTrainer, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + MultiLingAdapterArguments, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.adapters.configuration import AdapterConfig +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. 
Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." 
+ }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, adapter_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"model_args {model_args}") + logger.info(f"data_args {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + logger.info(f"Adapter parameters {adapter_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. 
To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + ) + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
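+    # (Resolution order below: an explicit --config_name / --tokenizer_name takes precedence
+    #  over --model_name_or_path; with neither set, a fresh config of --model_type is built,
+    #  while training a brand-new tokenizer from scratch is deliberately unsupported here.)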
+ + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + model.resize_token_embeddings(len(tokenizer)) + + # Setup adapters + if adapter_args.train_adapter: + task_name = data_args.dataset_name or "clm" + task_name += f"_{adapter_args.language}" + # check if adapter already exists, otherwise add it + if task_name not in model.config.adapters: + # resolve the adapter config + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + ) + # load a pre-trained from Hub if specified + if adapter_args.load_adapter: + model.load_adapter( + adapter_args.load_adapter, + config=adapter_config, + load_as=task_name, + ) + # otherwise, add a fresh adapter + else: + model.add_adapter(task_name, config=adapter_config) + # optionally load a pre-trained language adapter + if adapter_args.load_lang_adapter: + # resolve the language adapter config + lang_adapter_config = AdapterConfig.load( + adapter_args.lang_adapter_config, + non_linearity=adapter_args.lang_adapter_non_linearity, + reduction_factor=adapter_args.lang_adapter_reduction_factor, + ) + # load the language adapter from Hub + lang_adapter_name = model.load_adapter( + adapter_args.load_lang_adapter, + config=lang_adapter_config, + load_as=adapter_args.language, + ) + else: + lang_adapter_name = None + # Freeze all model weights except of those of this adapter + model.train_adapter([task_name]) + # Set the adapters to be used in every forward pass + if lang_adapter_name: + model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + else: + 
model.set_active_adapters(task_name) + else: + if adapter_args.load_adapter or adapter_args.load_lang_adapter: + raise ValueError( + "Adapters can only be loaded in adapters training mode." + "Use --train_adapter to enable adapter training" + ) + + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + + # Preprocessing the datasets. + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output + + with training_args.main_process_first(desc="dataset map tokenization"): + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.output_dir}/tokenized_datasets.pt") + saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + logger.info("Sanity check: loaded tokenized_datasets") + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + logger.info("Sanity check: saved tokenized_datasets") + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
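+        # Example: with block_size=1024 and 2,500 concatenated tokens, total_length is
+        # rounded down to 2048, yielding two blocks; the trailing 452 tokens are dropped.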
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.output_dir}/lm_datasets.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + logger.info("Sanity check: loaded lm_datasets") + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + logger.info("Sanity check: saved lm_datasets") + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + # Initialize our Trainer + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. 
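+        # (Safe here because group_texts already emits fixed-length blocks with "labels",
+        #  so no dynamic padding is needed.)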
+ data_collator=default_data_collator, + ) + + logger.info(model) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/exp-007/run_clm_de.sh b/scripts/exp-007/run_clm_de.sh new file mode 100644 index 0000000..f1aa596 --- /dev/null +++ b/scripts/exp-007/run_clm_de.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=5-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:4 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=16 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-007-run_clm_de_madx + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_de_madx.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_de_madx.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +model_name="gpt2" +tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/exp-005/oscar-de-tokenizer" +cache_dir="${FP_BIGS}/data/external/oscar_de" +output_dir="${FP_BIGS}/data/processed/exp-007/madx-gpt2-de" +logging_dir="${FP_BIGS}/reports/exp-007/madx-gpt2-de" + +python $FP_BIGS/scripts/exp-007/madx_run_clm.py \ + --model_name_or_path $model_name \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name unshuffled_deduplicated_de \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 1000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --train_adapter \ + --adapter_config "pfeiffer+inv" \ + --language "de" \ + --num_train_epochs 6.0 \ No newline at end of file diff --git a/scripts/exp-007/run_clm_en.sh b/scripts/exp-007/run_clm_en.sh new file mode 100644 index 0000000..776222f --- /dev/null +++ b/scripts/exp-007/run_clm_en.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=5-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=3090-gcondo --gres=gpu:4 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=16
+
+# Use more memory (10GB) (CPU RAM):
+#SBATCH --mem=100g
+
+# Specify a job name:
+#SBATCH -J exp-007-run_clm_en_madx
+
+# Specify an output file
+#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_en_madx.out
+#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_en_madx.err
+
+# Set up the environment by loading modules
+set -a # automatically export all variables
+source ~/.env
+set +a
+
+module load python/3.7.4
+source $FP_BIGS/env_lang_adapter/bin/activate
+
+model_name="gpt2"
+tokenizer_dir="gpt2"
+cache_dir="${FP_BIGS}/data/external/oscar_en"
+output_dir="${FP_BIGS}/data/processed/exp-007/madx-gpt2-en"
+logging_dir="${FP_BIGS}/reports/exp-007/madx-gpt2-en"
+
+python $FP_BIGS/scripts/exp-007/madx_run_clm.py \
+    --model_name_or_path $model_name \
+    --tokenizer_name $tokenizer_dir \
+    --dataset_name oscar \
+    --cache_dir $cache_dir \
+    --dataset_config_name unshuffled_deduplicated_en \
+    --logging_dir $logging_dir \
+    --report_to "tensorboard" \
+    --learning_rate 0.001 \
+    --do_train \
+    --do_eval \
+    --output_dir $output_dir \
+    --preprocessing_num_workers 8 \
+    --overwrite_output_dir \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 4 \
+    --per_device_eval_batch_size 2 \
+    --eval_accumulation_steps 4 \
+    --eval_steps 1000 \
+    --evaluation_strategy "steps" \
+    --max_eval_samples 5000 \
+    --train_adapter \
+    --adapter_config "pfeiffer+inv" \
+    --language "en" \
+    --num_train_epochs 6.0
\ No newline at end of file
diff --git a/scripts/exp-007/run_clm_ko.sh b/scripts/exp-007/run_clm_ko.sh
new file mode 100644
index 0000000..611b4f2
--- /dev/null
+++ b/scripts/exp-007/run_clm_ko.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Request half an hour of runtime:
+#SBATCH --time=5-23:59:00
+
+# Ask for the GPU partition and 1 GPU
+#SBATCH --partition=3090-gcondo --gres=gpu:4
+
+# Default resources are 1 core with 2.8GB of memory.
+#SBATCH --ntasks=16 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-007-run_clm_ko_madx + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_ko_madx.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_ko_madx.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +model_name="gpt2" +tokenizer_dir="yongzx/gpt2-finetuned-oscar-ko" +cache_dir="${FP_BIGS}/data/external/oscar_ko" +output_dir="${FP_BIGS}/data/processed/exp-007/madx-gpt2-ko" +logging_dir="${FP_BIGS}/reports/exp-007/madx-gpt2-ko" +# ckpt_dir="${FP_BIGS}/data/processed/exp-007/ft-gpt2-2/checkpoint-25000" + +python $FP_BIGS/scripts/exp-007/madx_run_clm.py \ + --model_name_or_path $model_name \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name unshuffled_deduplicated_ko \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 1000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --train_adapter \ + --adapter_config "pfeiffer+inv" \ + --language "ko" \ + --num_train_epochs 6.0 \ No newline at end of file diff --git a/scripts/exp-008/xnli/xnli_de.py b/scripts/exp-008/xnli/xnli_de.py new file mode 100644 index 0000000..e21cee8 --- /dev/null +++ b/scripts/exp-008/xnli/xnli_de.py @@ -0,0 +1,151 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import BertTokenizer, BertForSequenceClassification + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +print("Arguments: ========") +print(args) + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, 
cache_dir=args.cache_dir) + assert args.lang != "en" + + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = BertTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, pad_to_max_length=True) + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.sep_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +logger.info("Tokenizing the dataset...") +# tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = train_dataset.map(tokenize_function, batched=False) +full_val_dataset = val_dataset.map(tokenize_function, batched=False) +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) +logger.info(full_train_dataset[100]) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="steps", + save_strategy="steps", + logging_strategy="steps", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args): + return BertForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=0, + cache_dir=args.cache_dir) + + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + + model = load_model(args) + training_args.report_to = list() + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-008/xnli/xnli_de_mbert.sh 
b/scripts/exp-008/xnli/xnli_de_mbert.sh new file mode 100644 index 0000000..e6d2ca4 --- /dev/null +++ b/scripts/exp-008/xnli/xnli_de_mbert.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-008-xnli_de_mbert + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/008/xnli_de_mbert.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/008/xnli_de_mbert.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="bert-base-multilingual-uncased" + TOKENIZER_NAME="bert-base-multilingual-uncased" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-008/xnli/$LANG/xnli_${LANG}_mbert_${lr}" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-008/xnli/xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train +done diff --git a/scripts/exp-008/xnli/xnli_de_mbert_0shot.sh b/scripts/exp-008/xnli/xnli_de_mbert_0shot.sh new file mode 100644 index 0000000..0783b53 --- /dev/null +++ b/scripts/exp-008/xnli/xnli_de_mbert_0shot.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-008-xnli_de_mbert_0shot + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/008/xnli_de_mbert_0shot.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/008/xnli_de_mbert_0shot.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="bert-base-multilingual-uncased" + TOKENIZER_NAME="bert-base-multilingual-uncased" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-008/xnli/$LANG/xnli_${LANG}_mbert_0shot_${lr}" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-008/xnli/xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --zero_shot +done diff --git a/scripts/exp-008/xnli/xnli_ko.py b/scripts/exp-008/xnli/xnli_ko.py new file mode 100644 index 0000000..5ebee8c --- /dev/null +++ b/scripts/exp-008/xnli/xnli_ko.py @@ -0,0 +1,197 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import BertTokenizer, BertForSequenceClassification + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +KLUE = namedtuple("KLUE", ["klue_split", "num_labels", "metric", "model_type"]) +KLUE_TASKS = { + "topic-cls": KLUE(klue_split="ynat", num_labels=7, metric="f1/macro", model_type="seq-cls"), + "sts-pearsonr": KLUE(klue_split="sts", num_labels=1, metric="pearsonr", model_type="seq-cls"), + "sts-binary": KLUE(klue_split="sts", num_labels=1, metric="f1/macro", model_type="seq-cls"), + "nli": KLUE(klue_split="nli", num_labels=3, metric="accuracy", model_type="seq-cls"), +} + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--klue_task", choices=KLUE_TASKS.keys(), default="nli") +parser.add_argument("--lang", type=str, default="ko") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +print("Arguments: ========") +print(args) + +# load dataset 
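+# Zero-shot: fine-tune on English XNLI (train/validation) and evaluate on the Korean KLUE
+# split below; supervised: use KLUE's own splits, carving an 80/20 train/validation split
+# out of KLUE train whenever the task ships no separate test set.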
+klue_dataset = load_dataset("klue", KLUE_TASKS[args.klue_task].klue_split, cache_dir=args.cache_dir) +if args.zero_shot: + print("0️⃣ 0-Shot") + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + + if "test" not in klue_dataset: + _train_dataset = klue_dataset['train'].train_test_split(train_size=0.8, shuffle=True, seed=42) + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = klue_dataset['validation'] + else: + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = klue_dataset['test'] +else: + print("👀 Supervised Training") + if "test" not in klue_dataset: + _train_dataset = klue_dataset['train'].train_test_split(train_size=0.8, shuffle=True, seed=42) + train_dataset = _train_dataset['train'] + val_dataset = _train_dataset['test'] + test_dataset = klue_dataset['validation'] + else: + train_dataset = klue_dataset['train'] + val_dataset = klue_dataset['validation'] + test_dataset = klue_dataset['test'] + + +# load tokenizer +tokenizer = BertTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) + +def tokenize_function(examples): + if KLUE_TASKS[args.klue_task].klue_split == "ynat": + return tokenizer(examples["title"], max_length=128, padding="max_length", truncation=True) + elif KLUE_TASKS[args.klue_task].klue_split == "sts": + return tokenizer(f'{examples["sentence1"]} {tokenizer.sep_token} {examples["sentence2"]}', max_length=128, padding="max_length", truncation=True) + elif KLUE_TASKS[args.klue_task].klue_split == "nli": + return tokenizer(f'{examples["premise"]} {tokenizer.sep_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def postprocessing(example): + if KLUE_TASKS[args.klue_task].klue_split == "sts": + example['labels'] = example['labels']['real-label'] + return example + else: + return example + +logger.info("Tokenizing the dataset...") +# tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +full_train_dataset = train_dataset.map(tokenize_function, batched=False).map(postprocessing) +full_val_dataset = val_dataset.map(tokenize_function, batched=False).map(postprocessing) +full_test_dataset = test_dataset.map(tokenize_function, batched=False).map(postprocessing) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) +logger.info(full_train_dataset[100]) + +def compute_metrics(eval_pred): + logits, labels = eval_pred + + if "pearsonr" in KLUE_TASKS[args.klue_task].metric: + predictions = logits.flatten() + else: + predictions = np.argmax(logits, axis=-1) + + ### only for STS-binary + if args.klue_task == "sts-binary": + predictions = np.where(logits.flatten() > 3.0, 1, 0) + labels = np.where(labels > 3.0, 1, 0) + # print(predictions) + # print(labels) + # assert False + + # apply metric + metric = load_metric(KLUE_TASKS[args.klue_task].metric.split("/")[0]) + if "/" in KLUE_TASKS[args.klue_task].metric: + return metric.compute(predictions=predictions, + references=labels, + average=KLUE_TASKS[args.klue_task].metric.split("/")[1]) + else: + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + 
num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=1, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + + +def load_model(args): + if KLUE_TASKS[args.klue_task].model_type == "seq-cls": + return BertForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=KLUE_TASKS[args.klue_task].num_labels, + pad_token_id=0, + cache_dir=args.cache_dir) + + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args) + training_args.report_to = list() + + trainer = Trainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-008/xnli/xnli_ko_mbert.sh b/scripts/exp-008/xnli/xnli_ko_mbert.sh new file mode 100644 index 0000000..37def1a --- /dev/null +++ b/scripts/exp-008/xnli/xnli_ko_mbert.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-008-xnli_ko_mbert + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/008/xnli_ko_mbert.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/008/xnli_ko_mbert.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="bert-base-multilingual-uncased" + TOKENIZER_NAME="bert-base-multilingual-uncased" + LANG="ko" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-008/xnli/$LANG/xnli_${LANG}_mbert_${lr}" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-008/xnli/xnli_ko.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train +done diff --git a/scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh b/scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh new file mode 100644 index 0000000..af3686d --- /dev/null +++ b/scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-008-xnli_ko_mbert_0shot + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/008/xnli_ko_mbert_0shot.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/008/xnli_ko_mbert_0shot.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="bert-base-multilingual-uncased" + TOKENIZER_NAME="bert-base-multilingual-uncased" + LANG="ko" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-008/xnli/$LANG/xnli_${LANG}_mbert_0shot_${lr}" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-008/xnli/xnli_ko.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 10 \ + --learning_rate $lr \ + --per_device_train_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --zero_shot +done From 4975573425e6b8b3c509ae55c68c5e745ce5aa20 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 21 Dec 2021 16:00:04 -0500 Subject: [PATCH 044/142] remove exp-001 --- scripts/exp-001/README.md | 22 - scripts/exp-001/download_oscar_ko.py | 13 - scripts/exp-001/download_oscar_ko.sh | 30 -- scripts/exp-001/run_clm.py | 549 ------------------------ scripts/exp-001/run_clm_ko.sh | 58 --- scripts/exp-001/train_tokenizer_gpt2.py | 20 - 6 files changed, 692 deletions(-) delete mode 100644 scripts/exp-001/README.md delete mode 100644 scripts/exp-001/download_oscar_ko.py delete mode 100644 
scripts/exp-001/download_oscar_ko.sh delete mode 100644 scripts/exp-001/run_clm.py delete mode 100644 scripts/exp-001/run_clm_ko.sh delete mode 100644 scripts/exp-001/train_tokenizer_gpt2.py diff --git a/scripts/exp-001/README.md b/scripts/exp-001/README.md deleted file mode 100644 index 75eb89d..0000000 --- a/scripts/exp-001/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# README - -- use `download_oscar_fr.sh` to download the datasets. To download datasets for other languages, make the necessary changes on line 8 in the `download_oscar_fr.py`. -- run `train_tokenizer_gpt2.py` to train the tokenizer for the new dataset. Make necessary changes on line 8 to load the dataset and line 20 to save the trained tokenizer. -- run `run_clm.sh` to train GPT-2. Important changes to arguments that might be made: - - `tokenizer_dir`: directory of saved tokenizer. - - `cache_dir`: directory of cached dataset from `download_oscar_fr.sh` (remember to make changes to the dataset use in the argument `dataset_name` and `dataset_config_name`). - - `output_dir`: directory where the gpt2 is checkpointed during training. - - `ckpt_dir`: used for continuing training from checkpoint. - ---- - -# Decisions - -**Dataset**: HF's OSCAR unshuffled_deduplicated_fr - -**Tokenizer**: byte-level Byte-pair encoding tokenizer (same as GPT-2). Training is identical to the section "Using an existing tokenizer" in huggingface's tokenizer_training [tutorial](https://github.com/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb) -tokenizer_name: `/users/zyong2/data/zyong2/bigscience/data/processed/exp-001/oscar-fr-tokenizer` -- train the GPT-2 tokenizer with the exact same algorithms and parameters as an existing one. -- vocab_size: 50,257 (same as original GPT-2) - - diff --git a/scripts/exp-001/download_oscar_ko.py b/scripts/exp-001/download_oscar_ko.py deleted file mode 100644 index 534073d..0000000 --- a/scripts/exp-001/download_oscar_ko.py +++ /dev/null @@ -1,13 +0,0 @@ -from datasets import load_dataset -from dotenv import load_dotenv -import os -from pathlib import Path - -load_dotenv(str(Path.home() / ".env")) - -dataset = load_dataset("oscar", "unshuffled_deduplicated_ko", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_ko") - -from datasets.filesystems import S3FileSystem -s3 = S3FileSystem(key="KEY", secret="VAL") -dataset.save_to_disk('s3://bigscience-add-lang/oscar_ko', fs=s3) -print("Done") \ No newline at end of file diff --git a/scripts/exp-001/download_oscar_ko.sh b/scripts/exp-001/download_oscar_ko.sh deleted file mode 100644 index f6bff6d..0000000 --- a/scripts/exp-001/download_oscar_ko.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=3-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-031-download_oscar_ko - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_ko.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/download_oscar_ko.err - -# Set up the environment by loading modules -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -python3 $FP_BIGS/scripts/exp-001/download_oscar_ko.py \ No newline at end of file diff --git a/scripts/exp-001/run_clm.py b/scripts/exp-001/run_clm.py deleted file mode 100644 index 4c1407f..0000000 --- a/scripts/exp-001/run_clm.py +++ /dev/null @@ -1,549 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. - -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=causal-lm -""" -# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. - -import torch -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from typing import Optional -import pathlib - -import datasets -from datasets import load_dataset - -import transformers -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - Trainer, - TrainingArguments, - default_data_collator, - set_seed, -) -from transformers.testing_utils import CaptureLogger -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.11.0.dev0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - -logger = logging.getLogger(__name__) - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." 
- }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - - block_size: Optional[int] = field( - default=None, - metadata={ - "help": "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." 
- }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. 
- set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - if model_args.model_name_or_path: - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - model = AutoModelForCausalLM.from_config(config) - n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - - model.resize_token_embeddings(len(tokenizer)) - for name, param in model.named_parameters(): - if name not in ('transformer.wte.weight', 'transformer.wpe.weight'): - print(f"🥶 Freeze layer '{name}'") - param.requires_grad = False - else: - param.requires_grad = True - - # Preprocessing the datasets. - # First we tokenize all the texts. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - else: - column_names = raw_datasets["validation"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - - def tokenize_function(examples): - with CaptureLogger(tok_logger) as cl: - output = tokenizer(examples[text_column_name]) - # clm input could be much much longer than block_size - if "Token indices sequence length is longer than the" in cl.out: - tok_logger.warning( - "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 
- ) - return output - - with training_args.main_process_first(desc="dataset map tokenization"): - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.output_dir}/tokenized_datasets.pt") - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): - tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) - print("Sanity check: loaded tokenized_datasets") - else: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - torch.save(tokenized_datasets, saved_tokenized_datasets_fp) - print("Sanity check: saved tokenized_datasets") - - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." - ) - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." - ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - with training_args.main_process_first(desc="grouping texts together"): - saved_lm_datasets_fp = pathlib.Path(f"{training_args.output_dir}/lm_datasets.pt") - if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): - lm_datasets = torch.load(str(saved_lm_datasets_fp)) - print("Sanity check: loaded lm_datasets") - else: - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - torch.save(lm_datasets, saved_lm_datasets_fp) - print("Sanity check: saved lm_datasets") - - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = lm_datasets["train"] - if data_args.max_train_samples is not None: - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = lm_datasets["validation"] - if data_args.max_eval_samples is not None: - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - - import gc - del tokenized_datasets - gc.collect() - - # Initialize our Trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. - data_collator=default_data_collator, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - print("Checkpoint:", checkpoint) - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - if training_args.push_to_hub: - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - 
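        # Illustrative note (not part of the original diff): with the arguments used by the
        # run_clm_ko.sh launcher deleted just below (--model_name_or_path gpt2, --dataset_name oscar,
        # --dataset_config_name unshuffled_deduplicated_ko), the dict assembled above would be
        # {"finetuned_from": "gpt2", "tasks": "text-generation", "dataset_tags": "oscar",
        #  "dataset_args": "unshuffled_deduplicated_ko", "dataset": "oscar unshuffled_deduplicated_ko"},
        # and it is forwarded as model-card metadata to trainer.push_to_hub() below.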
trainer.push_to_hub(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() diff --git a/scripts/exp-001/run_clm_ko.sh b/scripts/exp-001/run_clm_ko.sh deleted file mode 100644 index 60fb27e..0000000 --- a/scripts/exp-001/run_clm_ko.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=6-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:4 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=16 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-001-run_clm_ko - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_ko.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/001/run_clm_ko.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -module load gitlfs/2.7.1 -source $FP_BIGS/env_lang_mod/bin/activate - -tokenizer_dir="yongzx/gpt2-finetuned-oscar-ko" -cache_dir="${FP_BIGS}/data/external/oscar_ko" -output_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-ko" -logging_dir="${FP_BIGS}/reports/exp-001/ft-gpt2-ko" -ckpt_dir="${FP_BIGS}/data/processed/exp-001/ft-gpt2-ko/checkpoint-195500" - -python $FP_BIGS/scripts/exp-001/run_clm.py \ - --model_name_or_path gpt2 \ - --tokenizer_name $tokenizer_dir \ - --dataset_name oscar \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_ko \ - --logging_dir $logging_dir \ - --report_to "tensorboard" \ - --learning_rate 0.001 \ - --do_train \ - --do_eval \ - --output_dir $output_dir \ - --preprocessing_num_workers 8 \ - --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ - --eval_steps 1000 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ - --resume_from_checkpoint $ckpt_dir \ No newline at end of file diff --git a/scripts/exp-001/train_tokenizer_gpt2.py b/scripts/exp-001/train_tokenizer_gpt2.py deleted file mode 100644 index 20e6555..0000000 --- a/scripts/exp-001/train_tokenizer_gpt2.py +++ /dev/null @@ -1,20 +0,0 @@ -from datasets import load_dataset -from dotenv import load_dotenv -import os -from pathlib import Path - -load_dotenv(str(Path.home() / ".env")) - -dataset = load_dataset("oscar", "unshuffled_deduplicated_ko", cache_dir=f"{os.getenv('FP_BIGS')}/data/external/oscar_ko") - -def batch_iterator(): - batch_size = 1000 - for i in range(0, len(dataset), batch_size): - yield dataset['train'][i : i + batch_size]["text"] - -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("gpt2") -assert tokenizer.is_fast -new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50_257) -new_tokenizer.save_pretrained(f"{os.getenv('FP_BIGS')}/data/processed/exp-001/oscar-ko-tokenizer") \ No newline at end of file From f0032869085f4a2c2df6607f6f29fc0713da76e8 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 21 Dec 2021 16:13:46 -0500 Subject: [PATCH 045/142] remove README --- scripts/README.md | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 scripts/README.md diff --git a/scripts/README.md b/scripts/README.md deleted file mode 100644 index c1da51c..0000000 --- a/scripts/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Current Experiments -- `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and 
`wpe` on HF's OSCAR `unshuffled_deduplicated_ko`. -- `exp-002`: evaluate gpt-2-{finetuned on OSCAR-KO, base} on KLUE's tasks (CLS, XNLI, PAWS) - -# Carbon Tracking -Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) From aa5f308f5ab53b2da893bba0004791b470ac8f16 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 9 Feb 2022 14:43:21 +0100 Subject: [PATCH 046/142] added scripts to evaluate each layer of pretrained LM (encoder only) for cross lingual sentence retrieval task --- .../compute_retrieval_acc.sh | 22 ++ .../compute_retrieval_acc_bs.sh | 10 + .../eval_sentence_retrieval.py | 221 ++++++++++++++++++ 3 files changed, 253 insertions(+) create mode 100644 scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh create mode 100644 scripts/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh create mode 100644 scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py diff --git a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh new file mode 100644 index 0000000..a0afcd8 --- /dev/null +++ b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh @@ -0,0 +1,22 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gres="gpu:1" +#SBATCH --ntasks=16 +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J eval_retrieval_acc + +# Specify an output file +#SBATCH -o /tmp-network/user/vnikouli/Projects/bigscience/logs/eval_retrieval_acc-%j.out +#SBATCH -e /tmp-network/user/vnikouli/Projects/bigscience/logs/eval_retrieval_acc-%j.err + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com + + +model=$1 +dataset=$2 +outdir=retrieval_acc_${model}-${dataset} +mkdir $outdir +python eval_sentence_retrieval.py $outdir --pretrained_model $model --tokenizer $model --dataset $dataset diff --git a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh new file mode 100644 index 0000000..5c7efc2 --- /dev/null +++ b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh @@ -0,0 +1,10 @@ +for model in tr5b-1B3-multilingual-alpha-checkpoints; do + for ch in 12000 55500 99000 100500 117000 118500; do + mname=${model}/ch${ch} + for dataset in flores ted_multi; do + outdir=retrieval_acc_${model}-${dataset} + mkdir -p $outdir + sbatch compute_retrieval_acc.sh ${mname} ${dataset} + done + done +done diff --git a/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py b/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py new file mode 100644 index 0000000..b718547 --- /dev/null +++ b/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py @@ -0,0 +1,221 @@ +import logging +import argparse +import os +from datasets import load_dataset +from collections import namedtuple +import torch +import numpy as np +from transformers import BertTokenizer, BertModel +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM +import matplotlib +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import os.path +import sys +from loguru import logger +import random +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--pretrained_model", default="bert-base-multilingual-cased") 
+parser.add_argument("--tokenizer", default="bert-base-multilingual-cased") +parser.add_argument("--dataset", default="ted_multi") +parser.add_argument("--device", default="cuda") +args = parser.parse_args() + +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) +ted_lngs = ['am', 'ar', 'bn', 'ca', 'en', 'es', 'fr', 'hi', 'id', 'ja', 'pt', 'zh-cn', 'zh-tw', 'pt-br'] +flores_lng = ["amh", "bos", "cat", "eng", "spa", "fra", "hin", "ind", "jpn", "por", "swh", "vie", "urd"] +bs_languages = ["id", "eu", "vi", "zh", "ur", "es", "ca", "pt", "fr", "en", "hi", "ar", "bn"] +lngcode_map = {"am":"amh", "bn":"bos", "ca":"cat", "en":"eng", "es":"spa", "fr": "fra", "hi": "hin", "id": "ind", "ja": "jpn", "pt": "por", "ur":"urd", "vi":"vie" } + + +print("Arguments: ========") +print(args) + + +def load_dataset_(args): + if args.dataset == "ted_multi": + return load_dataset_ted(args) + if args.dataset == "flores": + return load_dataset_flores(args) + + +def load_dataset_flores_for_lng(args, lng): + dataset = load_dataset("gsarti/flores_101", lngcode_map[lng])['dev'] + return dataset + +def load_dataset_flores(args): + dataset = {} + for lng in bs_languages: + if lng in lngcode_map: + load_dataset_flores_for_lng(args, lng) + return dataset + +def load_dataset_ted(args): + dataset = load_dataset("ted_multi")['validation'] + return dataset + +def get_talks(dataset, nb_talks): + talk_names = [] + for t in dataset['talk_name']: + if len(talk_names) < nb_talks and not t in talk_names: + talk_names.append(t) + + + print([(t1, len([t for t in dataset['talk_name'] if t == t1])) for t1 in talk_names]) + return talk_names + +def load_model(args): + if "xlm" in args.pretrained_model or "bert" in args.pretrained_model: + model = AutoModelForMaskedLM.from_pretrained(args.pretrained_model) + else: + model = AutoModelForCausalLM.from_pretrained(args.pretrained_model) + model.config.output_hidden_states=True + return model.to(args.device) + +Sample = namedtuple( + "Sample", + ("id", "hidden_state") +) + +def load_from_file(fname): + return torch.load(fname) + + +def get_hidden_states(args, model): + if args.dataset == "ted_multi": + dataset = load_dataset_(args) + nb_talks = 2 + talks = get_talks(dataset, nb_talks) + + emb = get_hidden_states_for_talks(dataset, model, talks, args.pretrained_model) + + outname = f"{args.output_dir}/{args.pretrained_model.replace('/','-')}-talks-valid-{len(talks)}" + + elif args.dataset == "flores": + nb_samples = 200 + emb = get_hidden_states_for_flores(args, model, args.pretrained_model, nb_samples = nb_samples) + outname = f"{args.output_dir}/{args.pretrained_model.replace('/','-')}-flores-{nb_samples}" + + retrieval_acc = {} + nb_states = model.config.num_hidden_layers + fig, ax = plt.subplots(1, int(nb_states/step), figsize=(12*int(nb_states/step), 10)) + + + with open(f"{outname}.log", 'w') as fout: + for state in range(0, nb_states, step): + plot_retrieval_acc(state, emb, ax[int(state/step)], fout) + + fig.tight_layout() + plt.savefig(f'{outname}-heatmap.png') + + +def get_hidden_states_for_flores(args, model, mname, nb_samples=50): + emb = {} + hidden_state_size = model.config.num_hidden_layers + for lng in bs_languages: + if lng in lngcode_map: + fname = f"{args.output_dir}/flores-{lng}-{nb_samples}-{mname.replace('/','-')}.pt" + if os.path.isfile(fname): + emb[lng] = load_from_file(fname) + else: + dataset = load_dataset_flores_for_lng(args, lng) + emb[lng] = {} + for state in range(hidden_state_size): + emb[lng][state] = [] + for i, sid in 
enumerate(dataset['id'][:nb_samples]): + t = dataset['sentence'][i] + x = tokenizer(t, return_tensors="pt").input_ids.to(model.device) + out = model(x) + for state in range(hidden_state_size): + hs = torch.mean(out.hidden_states[state][0][1:-1], dim=0).detach() + emb[lng][state].append(Sample(sid, hs)) + torch.save(emb[lng], fname) + return emb + + +def get_hidden_states_for_talks(dataset, model, talks, mname): + emb = {} + hidden_state_size = model.config.num_hidden_layers + fname = f"{args.output_dir}/ted_multi-{mname.replace('/','-')}-ted_multi-{len(talks)}.pt" + if os.path.isfile(fname): + return load_from_file(fname) + for sid, sample in enumerate(dataset): + if sample['talk_name'] in talks: + tsample = sample['translations'] + for i, lng in enumerate(tsample['language']): + if lng in bs_languages: + t = tsample['translation'][i] + x = tokenizer(t, return_tensors="pt").input_ids.to(model.device) + if not lng in emb: + emb[lng] = {} + for state in range(hidden_state_size): + emb[lng][state] = [] + out = model(x) + for state in range(hidden_state_size): + hs = torch.mean(out.hidden_states[state][0][1:-1], dim=0).detach() + emb[lng][state].append(Sample(sid, hs)) + torch.save(emb, fname) + return emb + + +def compute_sent_retrieval_acc(lng1, lng2, emb, state, out): + cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6) + E1 = torch.stack([s[1] for s in emb[lng1][state]]) + E2 = torch.stack([s[1] for s in emb[lng2][state]]) + #cos_matrix = [[cos(E2[i],E1[j]) for i in range(E2.shape[0]) ] for j in range(E1.shape[0])] + match = 0 + intersection_ids = set([emb[lng1][state][i][0] for i in range(E1.shape[0])]).intersection( + set([emb[lng2][state][i][0] for i in range(E2.shape[0])]) + ) + if len(intersection_ids)>0: + random_acc = 1/len(intersection_ids) + for i in range(E1.shape[0]): + if emb[lng1][state][i][0] in intersection_ids: + cos_sim = [cos(E2[j], E1[i]) for j in range(E2.shape[0])] + best_match = torch.argmax(torch.stack(cos_sim)) + if emb[lng2][state][best_match][0] == emb[lng1][state][i][0]: + match +=1 + acc = match/len(intersection_ids) + out.write(f"{lng1}-{lng2} = {acc} (random {random_acc} )\n") + return acc, len(intersection_ids) + else: + return 0, 0 + +def plot_retrieval_acc(state, emb, ax, out): + cmap="RdYlBu" + mean_per_state = 0 + for lng1 in emb: + if not lng1 in retrieval_acc: + retrieval_acc[lng1] = {} + for lng2 in emb: + lng2_chance = 1.0/len(emb[lng2][0]) + #if not lng1 == lng2: + acc, random_acc = compute_sent_retrieval_acc(lng1, lng2, emb, state, out) + retrieval_acc[lng1][lng2] = acc + #retrieval_acc[lng1]["random"] = lng2_chance + mean_acc = np.mean([v for v in retrieval_acc[lng1].values()]) + out.write(f"ACC per {lng1}, layer {state} = {mean_acc} \n" ) + mean_per_state +=mean_acc + mean_per_state = mean_per_state/len(emb.keys()) + out.write(f"ACC overall, layer {state} = {mean_per_state}\n" ) + m_res = pd.DataFrame(retrieval_acc) + m_res.columns=emb.keys() + m_res.index=emb.keys()#[e for e in emb.keys()]+["random"] + ax.set_title(f"state {state}") + sns.heatmap(m_res, ax=ax, annot=False, vmin=0, vmax=1.0, center=0, cmap=cmap) + + + +lngs2consider = ['am', 'ar', 'bn', 'ca', 'en', 'es', 'fr', 'hi', 'id', 'ja', 'pt', 'zh-cn', 'zh-tw', 'pt-br'] +samples = 10 +model = load_model(args) +retrieval_acc = {} +step=1 +get_hidden_states(args, model) From 75bf06e1f0912990445abb2b18d74330d5941c84 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 11 Feb 2022 15:35:47 +0100 Subject: [PATCH 047/142] fix due to diff tokenization for ted dataset --- 
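Note on the change below: mean-pooling now runs over every position of the hidden state instead of slicing off the first and last ones. The [1:-1] slice is only meaningful for tokenizers that wrap the input in special tokens such as [CLS] ... [SEP]; with a GPT-2-style tokenizer, which adds no special tokens, it silently drops two real subwords. A minimal sketch of the difference, with illustrative tensor shapes only (not part of the patch):

    import torch

    hidden = torch.randn(1, 7, 16)                # (batch, seq_len, hidden_size) for one sentence
    pooled_sliced = hidden[0][1:-1].mean(dim=0)   # assumes positions 0 and -1 are special tokens
    pooled_full = hidden[0].mean(dim=0)           # pools over every token, as in the fix below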
.../exp_sentence_retrievale_eval/eval_sentence_retrieval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py b/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py index b718547..3fdf4e3 100644 --- a/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py +++ b/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py @@ -145,7 +145,8 @@ def get_hidden_states_for_talks(dataset, model, talks, mname): hidden_state_size = model.config.num_hidden_layers fname = f"{args.output_dir}/ted_multi-{mname.replace('/','-')}-ted_multi-{len(talks)}.pt" if os.path.isfile(fname): - return load_from_file(fname) + emb = load_from_file(fname) + return emb for sid, sample in enumerate(dataset): if sample['talk_name'] in talks: tsample = sample['translations'] @@ -159,7 +160,7 @@ def get_hidden_states_for_talks(dataset, model, talks, mname): emb[lng][state] = [] out = model(x) for state in range(hidden_state_size): - hs = torch.mean(out.hidden_states[state][0][1:-1], dim=0).detach() + hs = torch.mean(out.hidden_states[state][0], dim=0).detach() emb[lng][state].append(Sample(sid, hs)) torch.save(emb, fname) return emb From 6b200d5c4125c6c12047af37c598ebc40e5d4fa3 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 23 Feb 2022 01:41:50 +0100 Subject: [PATCH 048/142] added scripts for training model with adapters and embedding layer FT --- scripts/madx_exp/madx_lngembft_clm.py | 617 ++++++++++++++++++++++++ scripts/madx_exp/run_clm_madx_lngemb.sh | 68 +++ 2 files changed, 685 insertions(+) create mode 100644 scripts/madx_exp/madx_lngembft_clm.py create mode 100644 scripts/madx_exp/run_clm_madx_lngemb.sh diff --git a/scripts/madx_exp/madx_lngembft_clm.py b/scripts/madx_exp/madx_lngembft_clm.py new file mode 100644 index 0000000..45b7c35 --- /dev/null +++ b/scripts/madx_exp/madx_lngembft_clm.py @@ -0,0 +1,617 @@ +""" +Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import torch +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +import transformers.adapters.composition as ac +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AdapterTrainer, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + MultiLingAdapterArguments, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.adapters.configuration import AdapterConfig +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 
+ """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." 
+ }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def load_tokenizer(model_args): + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + return tokenizer + + + +def load_data(data_args, model_args): + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
+ raw_datasets = load_dataset(
+ data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+ )
+
+ else:
+ data_files = {}
+ dataset_args = {}
+ if data_args.train_file is not None:
+ data_files["train"] = data_args.train_file
+ if data_args.validation_file is not None:
+ data_files["validation"] = data_args.validation_file
+ extension = (
+ data_args.train_file.split(".")[-1]
+ if data_args.train_file is not None
+ else data_args.validation_file.split(".")[-1]
+ )
+ if extension == "txt":
+ extension = "text"
+ dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+ raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args)
+
+ if "validation" not in raw_datasets.keys():
+ if data_args.max_eval_samples is not None and data_args.max_train_samples is not None:
+ raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples)
+ elif data_args.max_eval_samples is not None :
+ raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples)
+ else:
+ raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.validation_split_percentage/100.0)
+
+ raw_datasets['validation'] = raw_datasets['test']
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+ # Load pretrained model and tokenizer
+ #
+ # Distributed training:
+ # The .from_pretrained methods guarantee that only one local process can concurrently
+ # download model & vocab.
+
+ return raw_datasets
+
+def load_model(model_args, tokenizer):
+ config_kwargs = {
+ "cache_dir": model_args.cache_dir,
+ "revision": model_args.model_revision,
+ "use_auth_token": True if model_args.use_auth_token else None,
+ }
+ if model_args.config_name:
+ config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
+ elif model_args.model_name_or_path:
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+ else:
+ config = CONFIG_MAPPING[model_args.model_type]()
+ logger.warning("You are instantiating a new config instance from scratch.")
+ if model_args.config_overrides is not None:
+ logger.info(f"Overriding config: {model_args.config_overrides}")
+ config.update_from_string(model_args.config_overrides)
+ if model_args.model_name_or_path:
+ model = AutoModelForCausalLM.from_pretrained(
+ model_args.model_name_or_path,
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
+ config=config,
+ cache_dir=model_args.cache_dir,
+ revision=model_args.model_revision,
+ use_auth_token=True if model_args.use_auth_token else None,
+ )
+ else:
+ model = AutoModelForCausalLM.from_config(config)
+ n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+ logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+
+ #TODO: remap embedding parameters
+ #if not tokenizer.name_or_path == model_args.model_name_or_path:
+ # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+
+ model.resize_token_embeddings(len(tokenizer))
+ return model
+
+def preprocess_data(training_args, data_args, model_args, tokenizer):
+ with training_args.main_process_first(desc="dataset map tokenization"):
+ saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_datasets.pt")
+ if not tokenizer.name_or_path ==
model_args.model_name_or_path: + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_tokenized_datasets.pt") + + saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + logger.info("Sanity check: loaded tokenized_datasets") + else: + raw_datasets = load_data(data_args, model_args) + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + + text_column_name = "text" if "text" in column_names else column_names[0] + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + logger.info("Sanity check: saved tokenized_datasets") + if "train" not in tokenized_datasets and training_args.do_train: + raise ValueError("--do_train requires a train dataset") + if "validation" not in tokenized_datasets and training_args.do_eval: + raise ValueError("--do_eval requires a validation dataset") + return tokenized_datasets + + +def get_lm_dataset(training_args, data_args, model_args, tokenizer): + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
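# Worked example of the truncation above and the chunking just below (illustrative
# numbers, not from a real run): with block_size = 1024 and 2,500 concatenated token
# ids, total_length becomes (2500 // 1024) * 1024 = 2048, so each key is split into
# the chunks [0:1024) and [1024:2048) and the trailing 452 ids are dropped.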
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_datasets.pt") + if not tokenizer.name_or_path == model_args.model_name_or_path: + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_lm_datasets.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + logger.info("Sanity check: loaded lm_datasets") + else: + + tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + logger.info("Sanity check: saved lm_datasets") + return lm_datasets + +def add_adapters(adapter_args, data_args, model): + # Setup adapters + if adapter_args.train_adapter: + task_name = data_args.dataset_name or "clm" + task_name += f"_{adapter_args.language}" + # check if adapter already exists, otherwise add it + if task_name not in model.config.adapters: + # resolve the adapter config + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + ) + # load a pre-trained from Hub if specified + if adapter_args.load_adapter: + model.load_adapter( + adapter_args.load_adapter, + config=adapter_config, + load_as=task_name, + ) + # otherwise, add a fresh adapter + else: + model.add_adapter(task_name, config=adapter_config) + # optionally load a pre-trained language adapter + if adapter_args.load_lang_adapter: + # resolve the language adapter config + lang_adapter_config = AdapterConfig.load( + adapter_args.lang_adapter_config, + non_linearity=adapter_args.lang_adapter_non_linearity, + reduction_factor=adapter_args.lang_adapter_reduction_factor, + ) + # load the language adapter from Hub + lang_adapter_name = model.load_adapter( + adapter_args.load_lang_adapter, + config=lang_adapter_config, + load_as=adapter_args.language, + ) + else: + lang_adapter_name = None + # Freeze all model weights except of those of this adapter + model.train_adapter([task_name]) + # Set the adapters to be used in every forward pass + if lang_adapter_name: + model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + else: + model.set_active_adapters(task_name) + else: + if adapter_args.load_adapter or adapter_args.load_lang_adapter: + raise ValueError( + "Adapters can only be loaded in adapters training mode." 
+ "Use --train_adapter to enable adapter training" + ) + trainable_params = 0 + frozen_params = 0 + emb_params = 0 + for name, param in model.named_parameters(): + if not param.requires_grad: + if not "wte" in name and not "lm_head" in name: + print(f"🥶 Frozen layer '{name}'") + frozen_params +=param.numel() + else: + param.requires_grad = True + print(f"🚀 Trainable layer '{name}'") + emb_params += param.numel() + else: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + print(f"Total frozen parameters: {frozen_params}") + print(f"Total emb parameters: {emb_params}") + print(f"Total trainable parameters: {trainable_params}") + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, adapter_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() + training_args.data_dir = f'{training_args.output_dir}/../' + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"model_args {model_args}") + logger.info(f"data_args {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + logger.info(f"Adapter parameters {adapter_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + pass + #raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." + #) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + tokenizer = load_tokenizer(model_args) + model = load_model(model_args, tokenizer) + + add_adapters(adapter_args, data_args, model) + # Preprocessing the datasets. 
+ lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) + if training_args.do_train: + train_dataset = lm_datasets["train"] + + if training_args.do_eval: + + eval_dataset = lm_datasets["validation"] + + + # Initialize our Trainer + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. + data_collator=default_data_collator, + ) + + logger.info(model) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + +# if training_args.push_to_hub: +# trainer.push_to_hub(**kwargs) +# else: +# trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/scripts/madx_exp/run_clm_madx_lngemb.sh b/scripts/madx_exp/run_clm_madx_lngemb.sh new file mode 100644 index 0000000..4e1315b --- /dev/null +++ b/scripts/madx_exp/run_clm_madx_lngemb.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH -p gpu +#SBATCH --gres="gpu:1" + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=16 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-009-run_clm_de_madx + +# Specify an output file +#SBATCH -o /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_de_madx-%j.out +#SBATCH -e /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_de_madx-%j.err + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com + +# Set up the environment by loading modules +source /tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/env/bin/activate +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience/ + +data_sample=100000 +ch=$1 +lng=$2 +dataset=oscar +adapter_config="pfeiffer+inv" +adapter_reduction_factor=48 +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +tokenizer_dir="${FP_BIGS}/tokenizers/bigscience-1.3B-${lng}-tokenizer" +cache_dir="${FP_BIGS}/data/${dataset}_${lng}" +data_dir="${FP_BIGS}/exp-009/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}" +data_tok_dir="${FP_BIGS}/exp-009/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}/lng_tok" +output_dir="${data_dir}/withlngembft-lmhead-${adapter_config}-${adapter_reduction_factor}" +logging_dir="${FP_BIGS}/logs/exp-009/madx-bs1b3-multi-ch${ch}-${dataset}-${lng}-sample${data_sample}-withlngembft-lmhead-${adapter_config}-${adapter_reduction_factor}" + + +python $FP_BIGS/multilingual-modeling/scripts/madx_exp/madx_lngembft_clm.py \ + --fp16 \ + --model_name_or_path ${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name \ + --tokenizer_name ${tokenizer_dir} \ + --dataset_name ${dataset} \ + --cache_dir $cache_dir \ + --dataset_config_name unshuffled_deduplicated_${lng} \ + --logging_dir ${logging_dir} \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir ${output_dir} \ + --preprocessing_num_workers 16 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 2 \ + --eval_steps 5000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --train_adapter \ + --adapter_reduction_factor ${adapter_reduction_factor} \ + --language ${lng} \ + --num_train_epochs 6.0 \ + --adapter_config ${adapter_config} \ + --max_train_samples ${data_sample} From 481ee2cc70bf1f2f58f962b2f08cfbe018fa9312 Mon Sep 17 00:00:00 2001 From: yongzx Date: Sat, 5 Mar 2022 15:53:11 -0500 Subject: [PATCH 049/142] eval_xnli_de --- scripts/xnli_eval_de/adapters_xnli_de.py | 232 ++++++++++++++++++ .../xnli_eval_de/adapters_xnli_de_tr1.3B.sh | 60 +++++ .../adapters_xnli_de_tr1.3B_0shot.sh | 61 +++++ .../adapters_xnli_de_tr1.3B_emb>adpt.sh | 58 +++++ .../adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh | 59 +++++ .../adapters_xnli_de_tr1.3B_wpe.sh | 60 +++++ .../adapters_xnli_de_tr1.3B_wpe_0shot.sh | 61 +++++ scripts/xnli_eval_de/adapters_xnli_de_wpe.py | 232 ++++++++++++++++++ scripts/xnli_eval_de/xnli_de.py | 194 +++++++++++++++ scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh | 57 +++++ .../xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh | 62 +++++ 11 files changed, 1136 insertions(+) create mode 100644 scripts/xnli_eval_de/adapters_xnli_de.py create mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B.sh create mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_0shot.sh create mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt.sh create mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh create mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe.sh 
create mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe_0shot.sh create mode 100644 scripts/xnli_eval_de/adapters_xnli_de_wpe.py create mode 100644 scripts/xnli_eval_de/xnli_de.py create mode 100644 scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh create mode 100644 scripts/xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh diff --git a/scripts/xnli_eval_de/adapters_xnli_de.py b/scripts/xnli_eval_de/adapters_xnli_de.py new file mode 100644 index 0000000..8ce10a0 --- /dev/null +++ b/scripts/xnli_eval_de/adapters_xnli_de.py @@ -0,0 +1,232 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter", required=True) +parser.add_argument("--adapter_lang_name", required=True) +parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigsciece model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # 
has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) +logger.info(full_train_dataset[100]) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args, inference=False): + + # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one + # even when we call load_adapter + if args.zero_shot and not inference: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) + else: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + if not args.zero_shot or (args.zero_shot and inference): + # if not zero shot, that means that we need to replace the embedding layers during training + # we also need to replace embedding layers during inference + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + + # change the embedding layer of the original big science model + # by loading the adapters (which has saved lm_head) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + + # model has original bigscience embedding so replace it. 
+ model.resize_token_embeddings(len(tokenizer)) + model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] + + if not inference: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv", + load_as=args.adapter_lang_name) + if args.finetune_strategies == "whole": + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "lang_adapters": + model.train_adapter([args.adapter_lang_name]) + elif args.finetune_strategies == "task_adapters": + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + else: + raise ValueError("Lack configuration") + + print("🔥 ==================== Training: ==================== 🔥") + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + print(model) + else: + print("🔥 ==================== Inference: ==================== 🔥") + assert args.pretrained_adapters_dir + if args.finetune_strategies == "lang_adapters": + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "task_adapters": + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") + model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + print(model) + + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B.sh new file mode 100644 index 0000000..a7d2507 --- /dev/null +++ b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +#SBATCH --array=16,48,384 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-013-adapters_xnli_de_tr1.3B + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_%a.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_%a.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" + TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_100K_adpt_${SLURM_ARRAY_TASK_ID}/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-pfeiffer+inv-${SLURM_ARRAY_TASK_ID}/oscar_de" + FT_STRATEGIES="task_adapters" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_${LANG}_de_100K_adpt_${SLURM_ARRAY_TASK_ID}" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 2 \ + --learning_rate $lr \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ + --adapter_lang_name "xnli-de" \ + --finetune_strategies $FT_STRATEGIES +done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_0shot.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_0shot.sh new file mode 100644 index 0000000..3e65669 --- /dev/null +++ b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_0shot.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +#SBATCH --array=16,48,384 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-013-adapters_xnli_de_tr1.3B_0shot + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_0shot_%a.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_0shot_%a.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" + TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_100K_adpt_${SLURM_ARRAY_TASK_ID}/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-pfeiffer+inv-${SLURM_ARRAY_TASK_ID}/oscar_de" + FT_STRATEGIES="task_adapters" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_${LANG}_de_100K_adpt_${SLURM_ARRAY_TASK_ID}_0shot" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 2 \ + --learning_rate $lr \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ + --adapter_lang_name "xnli-de" \ + --finetune_strategies $FT_STRATEGIES \ + --zero_shot +done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt.sh new file mode 100644 index 0000000..ddc23d3 --- /dev/null +++ b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-013-adapters_xnli_de_tr1.3B_emb>adpt + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_emb>adpt.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_emb>adpt.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/de_50000" + TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/012/madx-bigs-de-50000-50000-48/oscar_de" + FT_STRATEGIES="task_adapters" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_${LANG}_de_100K_adpt_${SLURM_ARRAY_TASK_ID}" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 2 \ + --learning_rate $lr \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ + --adapter_lang_name "xnli-de" \ + --finetune_strategies $FT_STRATEGIES +done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh new file mode 100644 index 0000000..c52b12f --- /dev/null +++ b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-013-adapters_xnli_detr1.3B_emb>adpt_0shot + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_emb>adpt_0shot.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_emb>adpt_0shot.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/de_50000" + TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/012/madx-bigs-de-50000-50000-48/oscar_de" + FT_STRATEGIES="task_adapters" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_de_50K>50K_adpt_0shot" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 2 \ + --learning_rate $lr \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ + --adapter_lang_name "xnli-de" \ + --finetune_strategies $FT_STRATEGIES \ + --zero_shot +done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe.sh new file mode 100644 index 0000000..5f7e725 --- /dev/null +++ b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-013-adapters_xnli_de_tr1.3B_wpe + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_wpe.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_wpe.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + ORIGINAL_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" + MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_wpe/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-peft-pfeiffer+inv-16-withpretainedmodel/pretrained_model" + TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_wpe/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-peft-pfeiffer+inv-16-withpretainedmodel/oscar_de" + FT_STRATEGIES="task_adapters" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_de_wpe_adpt_0shot" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de_wpe.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 2 \ + --learning_rate $lr \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --original_model $ORIGINAL_MODEL \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ + --adapter_lang_name "xnli-de" \ + --finetune_strategies $FT_STRATEGIES +done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe_0shot.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe_0shot.sh new file mode 100644 index 0000000..4666eb1 --- /dev/null +++ b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe_0shot.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-013-adapters_xnli_de_tr1.3B_wpe_0shot + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_wpe_0shot.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_wpe_0shot.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_adapter/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 1e-5 ) +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + ORIGINAL_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" + MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_wpe/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-peft-pfeiffer+inv-16-withpretainedmodel/pretrained_model" + TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_wpe/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-peft-pfeiffer+inv-16-withpretainedmodel/oscar_de" + FT_STRATEGIES="task_adapters" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_de_wpe_adpt_0shot" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de_wpe.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 2 \ + --learning_rate $lr \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --original_model $ORIGINAL_MODEL \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ + --adapter_lang_name "xnli-de" \ + --finetune_strategies $FT_STRATEGIES \ + --zero_shot +done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_wpe.py b/scripts/xnli_eval_de/adapters_xnli_de_wpe.py new file mode 100644 index 0000000..82a673e --- /dev/null +++ b/scripts/xnli_eval_de/adapters_xnli_de_wpe.py @@ -0,0 +1,232 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM, GPT2Config + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") 
+parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter", required=True) +parser.add_argument("--adapter_lang_name", required=True) +parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigsciece model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) +logger.info(full_train_dataset[100]) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + 
    per_device_train_batch_size=args.per_device_train_batch_size,
+    gradient_accumulation_steps=args.gradient_accumulation_steps,
+    learning_rate=args.learning_rate,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    logging_strategy="epoch",
+    logging_steps=500,
+    report_to="tensorboard",
+    logging_dir=f"{args.output_dir}/logs",
+    load_best_model_at_end=True,
+)
+
+def load_model(args, inference=False):
+
+    # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one
+    # even when we call load_adapter
+    print(args)
+
+    if args.zero_shot and not inference:
+        model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model,
+                                                              num_labels=3,
+                                                              pad_token_id=en_tokenizer.pad_token_id,
+                                                              cache_dir=args.cache_dir)
+
+        original_config = GPT2Config.from_pretrained(args.original_model)
+        original_model = GPT2ForSequenceClassification.from_pretrained(args.original_model, num_labels=3)
+
+        # replace the embedding layer with original (contains-en) embedding.
+        logger.info("👉 Replace with en-language embedding")
+        model.resize_token_embeddings(original_config.vocab_size)
+
+        model._modules['transformer']._modules['wte'] = original_model._modules['transformer']._modules['wte']
+        model._modules['transformer']._modules['wpe'] = original_model._modules['transformer']._modules['wpe']
+        logger.info(f"👉 Embedding (wte) changes to {model._modules['transformer']._modules['wte']}")
+        logger.info(f"👉 Embedding (wpe) changes to {model._modules['transformer']._modules['wpe']}")
+    else:
+        model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model,
+                                                              num_labels=3,
+                                                              pad_token_id=tokenizer.pad_token_id,
+                                                              cache_dir=args.cache_dir)
+
+    if not inference:
+        adapter_name = model.load_adapter(args.madx_lang_adapter,
+                                          config="pfeiffer+inv",
+                                          load_as=args.adapter_lang_name)
+        if args.finetune_strategies == "whole":
+            model.set_active_adapters(adapter_name)
+        elif args.finetune_strategies == "lang_adapters":
+            model.train_adapter([args.adapter_lang_name])
+        elif args.finetune_strategies == "task_adapters":
+            model.add_adapter("xnli-task-adapter")
+            model.train_adapter("xnli-task-adapter")
+        else:
+            raise ValueError("Lack configuration")
+
+        print("🔥 ==================== Training: ==================== 🔥")
+        for name, param in model.named_parameters():
+            if not param.requires_grad:
+                print(f"🥶 Frozen layer '{name}'")
+            else:
+                print(f"🚀 Trainable layer '{name}'")
+        print(model)
+    else:
+        print("🔥 ==================== Inference: ==================== 🔥")
+        assert args.pretrained_adapters_dir
+        if args.finetune_strategies == "lang_adapters":
+            adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}")
+            model.set_active_adapters(adapter_name)
+        elif args.finetune_strategies == "task_adapters":
+            adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}")
+            model.set_active_adapters(adapter_name)
+            adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter")
+            model.set_active_adapters(adapter_name)
+        print(model)
+
+    return model
+
+if args.do_train:
+    logger.info("Start Training")
+    model = load_model(args)
+    trainer = AdapterTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset,
+        eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset,
+        compute_metrics=compute_metrics
+    )
+
+    trainer.train()
+
+if args.do_predict:
+    if args.do_eval_after_train:
+        evaluation_dirs = 
list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/xnli_eval_de/xnli_de.py b/scripts/xnli_eval_de/xnli_de.py new file mode 100644 index 0000000..f88ce76 --- /dev/null +++ b/scripts/xnli_eval_de/xnli_de.py @@ -0,0 +1,194 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer +from transformers import AutoTokenizer, GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +print("Arguments: ========") +print(args) + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + + +def tokenize_function(examples): + 
return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) +logger.info(full_train_dataset[100]) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(pretrained_model, cache_dir, pad_token_id=0): + return GPT2ForSequenceClassification.from_pretrained(pretrained_model, + num_labels=3, + pad_token_id=pad_token_id, + cache_dir=cache_dir) + + +if args.do_train: + logger.info("Start Training") + model = load_model(args.pretrained_model, + args.cache_dir, + en_tokenizer.pad_token_id if args.zero_shot else tokenizer.pad_token_id) + + if args.zero_shot: + # model is the finetuned model + original_config = GPT2Config.from_pretrained(args.original_model) + original_model = load_model(args.original_model, args.cache_dir) + no_en_wte = model._modules['transformer']._modules['wte'] + no_en_wpe = model._modules['transformer']._modules['wpe'] + + # replace the embedding layer with original (contains-en) embedding. 
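+        # (added note) no_en_wte / no_en_wpe above hold the target-language embeddings of the
+        # fine-tuned checkpoint; they are swapped out here so that English XNLI training runs on
+        # the original (English-capable) embeddings, and swapped back in during --do_predict below.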
+        logger.info("👉 Replace with en-language embedding")
+        model.resize_token_embeddings(original_config.vocab_size)
+
+        model._modules['transformer']._modules['wte'] = original_model._modules['transformer']._modules['wte']
+        model._modules['transformer']._modules['wpe'] = original_model._modules['transformer']._modules['wpe']
+        logger.info(f"👉 Embedding (wte) changes from {no_en_wte} to {model._modules['transformer']._modules['wte']}")
+        logger.info(f"👉 Embedding (wpe) changes from {no_en_wpe} to {model._modules['transformer']._modules['wpe']}")
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset,
+        eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset,
+        compute_metrics=compute_metrics
+    )
+
+    trainer.train()
+
+if args.do_predict:
+    if args.do_eval_after_train:
+        evaluation_dirs = list(sorted([
+            checkpoint_dir
+            for checkpoint_dir in os.listdir(args.output_dir)
+            if checkpoint_dir.startswith('checkpoint-')
+        ], key=lambda x: int(x[len('checkpoint-'):])))
+        args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}"
+        logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}")
+
+    # FIXME: hack for now because the tokenizer loaded from bigscience doesn't have the same
+    # vocab size as indicated in the config.json
+    # not an optimal fix for now because original_config could be passed directly to from_pretrained
+    if args.zero_shot:
+        original_config.save_pretrained(args.pretrained_model)
+
+    model = load_model(args.pretrained_model, args.cache_dir, tokenizer.pad_token_id)
+    if args.zero_shot:
+        # replace with target-language embedding.
+        logger.info("👉 Replace with target-language embedding")
+        logger.info(f"👉 len(tokenizer) = {len(tokenizer)}")
+        model.resize_token_embeddings(len(tokenizer))
+        model._modules['transformer']._modules['wte'] = no_en_wte
+        model._modules['transformer']._modules['wpe'] = no_en_wpe
+
+    training_args.report_to = list()
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset,
+        compute_metrics=compute_metrics
+    )
+
+    print("Evaluate on Test:", trainer.evaluate())
\ No newline at end of file
diff --git a/scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh b/scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh
new file mode 100644
index 0000000..4ef57e1
--- /dev/null
+++ b/scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Request half an hour of runtime:
+#SBATCH --time=1-23:59:00
+
+# Ask for the GPU partition and 1 GPU
+#SBATCH --partition=gpu-he --gres=gpu:1
+
+# Default resources are 1 core with 2.8GB of memory.
+#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-013-xnli_de_ft_tr1.3B + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/xnli_de_ft_tr1.3B.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/xnli_de_ft_tr1.3B.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 5e-5 ) + +# following https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification#fine-tuning-on-xnli +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + # MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/de_100000" + MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" + # TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + TOKENIZER_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/exp-013/xnli_${LANG}_ft_de_100000" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-013/xnli/xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 2 \ + --learning_rate $lr \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train +done diff --git a/scripts/xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh b/scripts/xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh new file mode 100644 index 0000000..a6700fd --- /dev/null +++ b/scripts/xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=1-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=2 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-013-xnli_de_ft_tr1.3B_0shot_ori + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/xnli_de_ft_tr1.3B_0shot_ori.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/xnli_de_ft_tr1.3B_0shot_ori.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +source $FP_BIGS/env_lang_mod/bin/activate + + +# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) +learning_rates=( 5e-5 ) + +# following https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification#fine-tuning-on-xnli +for lr in ${learning_rates[@]} ; do + echo "LR ===== $lr" + MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/de_100000" + # MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" # cross-lingual 0-shot (for BigScience original) + + #TOKENIZER_NAME affects the tokenization of test set + TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + # TOKENIZER_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" # cross-lingual 0-shot (for BigScience original) + LANG="de" + OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_${LANG}_ft_0shot_de_100000_ori" + CACHE_DIR="$FP_BIGS/data/external/xnli" + mkdir -p $OUTPUT_DIR + + python $FP_BIGS/scripts/exp-013/xnli/xnli_de.py \ + $OUTPUT_DIR \ + --lang $LANG \ + --cache_dir $CACHE_DIR \ + --num_train_epochs 2 \ + --learning_rate $lr \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 4 \ + --pretrained_model $MODEL_NAME \ + --tokenizer $TOKENIZER_NAME \ + --do_train \ + --do_eval_after_train \ + --zero_shot \ + --original_model "/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" +done From fbb3cf304f201606f698ba18c657641512497a68 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 28 Mar 2022 21:36:10 -0400 Subject: [PATCH 050/142] update --- scripts/lang_adapt/README.md | 14 + scripts/lang_adapt/madx_run_clm.py | 638 ++++++++++++++++++++++++++++ scripts/lang_adapt/run_clm_emb.sh | 72 ++++ scripts/lang_adapt/tokenized4clm.py | 57 +++ 4 files changed, 781 insertions(+) create mode 100644 scripts/lang_adapt/README.md create mode 100644 scripts/lang_adapt/madx_run_clm.py create mode 100644 scripts/lang_adapt/run_clm_emb.sh create mode 100644 scripts/lang_adapt/tokenized4clm.py diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md new file mode 100644 index 0000000..eeb7ccd --- /dev/null +++ b/scripts/lang_adapt/README.md @@ -0,0 +1,14 @@ +# README + +### Tokenizer and Tokenization of Dataset +First, run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. +- `lang`: language name (e.g., "de", "th") +- `tokenizer_dir`: path directory to save the tokenizer. +- `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer. 
+- `vocab_size`: vocab size of the tokenizer +- `extend_vocab`: whether we are extending the vocabulary of the embedding layer (determines the saved name of the tokenizer and for sanity check purpose with vocab_size) + - if `--extend_vocab`, we save the tokenizer as `{lang}_oscar_tokenizer_{vocab_size}` + - otherwise, we save the tokenizer as `{lang}_oscar_tokenizer_full` + +Then, +- use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. \ No newline at end of file diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py new file mode 100644 index 0000000..bcea14c --- /dev/null +++ b/scripts/lang_adapt/madx_run_clm.py @@ -0,0 +1,638 @@ +""" +Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import torch +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +import transformers.adapters.composition as ac +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AdapterTrainer, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + MultiLingAdapterArguments, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.adapters.configuration import AdapterConfig +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. 
Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + lang_adapt_strategies: str = field( + default="", + metadata={"help": "choose one of the three strategies - 'emb', 'emb-and-adpt', 'emb-then-adpt'"}, + ) + embedding_strategies: str = field( + default="", + metadata={"help": "choose one of the two strategies - 'replace', 'extend'"}, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + data_lang: Optional[str] = field( + default=None, metadata={"help": "The language of the dataset"} + ) + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." 
+ }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def load_tokenizer(model_args): + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + return tokenizer + + + +def load_data(data_args, model_args): + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
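+        # (added example, assuming the defaults used elsewhere in this patch)
+        # e.g. --dataset_name oscar --dataset_config_name unshuffled_deduplicated_th,
+        # as passed by run_clm_emb.sh below.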
+ raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + + if "validation" not in raw_datasets.keys(): + if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: + raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) + elif data_args.max_eval_samples is not None : + raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) + else: + raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) + raw_datasets['validation'] = raw_datasets['test'] + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + + if data_args.max_train_samples is not None: + # FIXME: currently assume the loaded checkpoint is trained with the first data_args.max_train_samples number of samples + raw_datasets["train"] = raw_datasets["train"].filter(lambda example, indice: indice < data_args.max_train_samples, with_indices=True) + raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) + if data_args.max_eval_samples is not None: + raw_datasets["validation"] = raw_datasets["validation"].select(range(data_args.max_eval_samples)) + + return raw_datasets + +def load_model(model_args, tokenizer): + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M 
params") + + #TODO: remap embedding parameters + #if not tokenizer.name_or_path == model_args.model_name_or_path: + # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + + model.resize_token_embeddings(len(tokenizer)) + return model + +def preprocess_data(training_args, data_args, model_args, tokenizer): + with training_args.main_process_first(desc="dataset map tokenization"): + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_data.pt") + + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + logger.info(f"✅ loaded tokenized_data") + else: + raw_datasets = load_data(data_args, model_args) + assert len(raw_datasets['train']) == data_args.max_train_samples + logger.info(f"🧠 Sanity check: loaded raw datasets have {data_args.max_train_samples} samples") + + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + + text_column_name = "text" if "text" in column_names else column_names[0] + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output + + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + logger.info(f"✅ saved tokenized_data") + if "train" not in tokenized_datasets and training_args.do_train: + raise ValueError("--do_train requires a train dataset") + if "validation" not in tokenized_datasets and training_args.do_eval: + raise ValueError("--do_eval requires a validation dataset") + return tokenized_datasets + + +def get_lm_dataset(training_args, data_args, model_args, tokenizer): + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. 
+ concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_data.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + logger.info("✅ loaded lm_data") + else: + tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + logger.info("✅ saved lm_data") + print(lm_datasets) + return lm_datasets + +def modify_model(adapter_args, data_args, model_args, model): + if model_args.lang_adapt_strategies == "emb": + for name, param in model.named_parameters(): + if "wte" not in name and "wpe" not in name: + param.requires_grad = False + + # Setup adapters + elif adapter_args.train_adapter: + task_name = data_args.dataset_name or "clm" + task_name += f"_{adapter_args.language}" + # check if adapter already exists, otherwise add it + if task_name not in model.config.adapters: + # resolve the adapter config + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + ) + # load a pre-trained from Hub if specified + if adapter_args.load_adapter: + model.load_adapter( + adapter_args.load_adapter, + config=adapter_config, + load_as=task_name, + ) + # otherwise, add a fresh adapter + else: + model.add_adapter(task_name, config=adapter_config) + # optionally load a pre-trained language adapter + if adapter_args.load_lang_adapter: + # resolve the language adapter config + lang_adapter_config = AdapterConfig.load( + adapter_args.lang_adapter_config, + non_linearity=adapter_args.lang_adapter_non_linearity, + reduction_factor=adapter_args.lang_adapter_reduction_factor, + ) + # load the language adapter from Hub + lang_adapter_name = model.load_adapter( + adapter_args.load_lang_adapter, + config=lang_adapter_config, + load_as=adapter_args.language, + ) + else: + lang_adapter_name = None + # Freeze all model weights except of those of this adapter + model.train_adapter([task_name]) + # Set the adapters to be used in every forward pass + if lang_adapter_name: + 
model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + else: + model.set_active_adapters(task_name) + else: + if adapter_args.load_adapter or adapter_args.load_lang_adapter: + raise ValueError( + "Adapters can only be loaded in adapters training mode." + "Use --train_adapter to enable adapter training" + ) + trainable_params = 0 + frozen_params = 0 + emb_params = 0 + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + frozen_params += param.numel() + else: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + + if "wte" and "wpe" in name: + emb_params += param.numel() + + print(f"Total frozen parameters: {frozen_params}") + print(f"Total emb parameters (wte, wpe): {emb_params}") + print(f"Total trainable parameters: {trainable_params}") + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, adapter_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() + training_args.data_dir = f'{training_args.output_dir}' + + assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt') + assert model_args.embedding_strategies in ('replace', 'extend') + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"model_args {model_args}") + logger.info(f"data_args {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + logger.info(f"Adapter parameters {adapter_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + pass + #raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." + #) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + tokenizer = load_tokenizer(model_args) + model = load_model(model_args, tokenizer) + + modify_model(adapter_args, data_args, model_args, model) + # Preprocessing the datasets. + lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) + if training_args.do_train: + train_dataset = lm_datasets["train"] + + if training_args.do_eval: + eval_dataset = lm_datasets["validation"] + + + # Initialize our Trainer + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. + data_collator=default_data_collator, + ) + + logger.info(model) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + +# if training_args.push_to_hub: +# trainer.push_to_hub(**kwargs) +# else: +# trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/lang_adapt/run_clm_emb.sh b/scripts/lang_adapt/run_clm_emb.sh new file mode 100644 index 0000000..cb397ff --- /dev/null +++ b/scripts/lang_adapt/run_clm_emb.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 +#SBATCH --array=100,200,500 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-020-run_clm_emb + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-020/run_clm_emb_%a.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-020/run_clm_emb_%a.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +module load gitlfs/2.7.1 +source $FP_BIGS/env_lang_adapter/bin/activate + + +# axis +LANG="th" +MAX_TRAIN_SAMPLES=$(($SLURM_ARRAY_TASK_ID * 1000)) +BIGS_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" + + +tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_full" +cache_dir="/users/zyong2/data/zyong2/huggingface/" +output_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/${LANG}_emb_${MAX_TRAIN_SAMPLES}samples" +logging_dir="/users/zyong2/data/zyong2/bigscience/data/reports/020/${LANG}_emb_${MAX_TRAIN_SAMPLES}samples" +mkdir -p $output_dir +mkdir -p $logging_dir + +python $FP_BIGS/scripts/exp-020/madx_run_clm.py \ + --model_name_or_path $BIGS_MODEL \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name "unshuffled_deduplicated_$LANG" \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 5000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --save_steps 25000 \ + --save_strategy "steps" \ + --max_train_samples $MAX_TRAIN_SAMPLES \ + --max_steps 50000 \ + --lang_adapt_strategies "emb" \ + --embedding_strategies "replace" \ No newline at end of file diff --git a/scripts/lang_adapt/tokenized4clm.py b/scripts/lang_adapt/tokenized4clm.py new file mode 100644 index 0000000..db64a90 --- /dev/null +++ b/scripts/lang_adapt/tokenized4clm.py @@ -0,0 +1,57 @@ +import torch +import datasets +import transformers +from transformers import AutoTokenizer +from datasets import load_dataset +import pathlib + +import argparse +import sys + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) +log_level = -1 +logger.setLevel(log_level) +datasets.utils.logging.set_verbosity(log_level) +transformers.utils.logging.set_verbosity(log_level) +transformers.utils.logging.enable_default_handler() +transformers.utils.logging.enable_explicit_format() +tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + +parser = argparse.ArgumentParser() +parser.add_argument('--lang', type=str, required=True) +parser.add_argument('--tokenizer_dir', type=str, required=True) +parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) +parser.add_argument('--vocab_size', default=130_000, type=int) +parser.add_argument('--extend_vocab', action='store_true') +args = parser.parse_args() +lang = args.lang +if args.extend_vocab: + assert args.vocab_size < 100_000 + +raw_datasets = load_dataset( + "oscar", + f"unshuffled_deduplicated_{lang}", + cache_dir=args.hf_cache_dir 
+) + +print(f"✅ Loaded raw_datasets OSCAR language {lang}") + +def batch_iterator(): + batch_size = 1000 + for i in range(0, len(raw_datasets), batch_size): + yield raw_datasets['train'][i : i + batch_size]["text"] + +tokenizer = AutoTokenizer.from_pretrained("gpt2") +assert tokenizer.is_fast +new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) +print("✅ Trained tokenizer with len ", len(new_tokenizer)) + +new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_tokenizer_{'full' if not args.extend_vocab else str(args.vocab_size)}") +print(f"✅ Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_tokenizer_{'full' if not args.extend_vocab else str(args.vocab_size)}") \ No newline at end of file From 2574ac920aac2cc94ce23801530f92562881d40f Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 28 Mar 2022 21:42:21 -0400 Subject: [PATCH 051/142] update --- scripts/lang_adapt/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index eeb7ccd..054e8ed 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -11,4 +11,4 @@ First, run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. - otherwise, we save the tokenizer as `{lang}_oscar_tokenizer_full` Then, -- use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. \ No newline at end of file +- use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`. \ No newline at end of file From 895cd76f84eeb520c3638dbb457c66412346f278 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 28 Mar 2022 22:36:49 -0400 Subject: [PATCH 052/142] update --- scripts/lang_adapt/README.md | 3 +- scripts/lang_adapt/run_clm_adpt.sh | 77 ++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 scripts/lang_adapt/run_clm_adpt.sh diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index 054e8ed..c9f96b8 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -11,4 +11,5 @@ First, run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. - otherwise, we save the tokenizer as `{lang}_oscar_tokenizer_full` Then, -- use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`. \ No newline at end of file +- use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. 
Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`. +- use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`. \ No newline at end of file diff --git a/scripts/lang_adapt/run_clm_adpt.sh b/scripts/lang_adapt/run_clm_adpt.sh new file mode 100644 index 0000000..e1f966d --- /dev/null +++ b/scripts/lang_adapt/run_clm_adpt.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 +#SBATCH --array=200,500 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-020-run_clm_adpt + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-020/run_clm_adpt_%a.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-020/run_clm_adpt_%a.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +module load gitlfs/2.7.1 +source $FP_BIGS/env_lang_adapter/bin/activate + + +# axis +LANG="th" +MAX_TRAIN_SAMPLES=$(($SLURM_ARRAY_TASK_ID * 1000)) +BIGS_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" +ADPT_REDUCTION_FACTOR=16 + +tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_full" +cache_dir="/users/zyong2/data/zyong2/huggingface/" +output_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/${LANG}_adpt_${MAX_TRAIN_SAMPLES}samples" +logging_dir="/users/zyong2/data/zyong2/bigscience/data/reports/020/${LANG}_adpt_${MAX_TRAIN_SAMPLES}samples" +mkdir -p $output_dir +mkdir -p $logging_dir + +adapter_config="pfeiffer+inv" +python $FP_BIGS/scripts/exp-020/madx_run_clm.py \ + --model_name_or_path $BIGS_MODEL \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name "unshuffled_deduplicated_$LANG" \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 5000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --save_steps 25000 \ + --save_strategy "steps" \ + --max_train_samples $MAX_TRAIN_SAMPLES \ + --max_steps 50000 \ + --train_adapter \ + --lang_adapt_strategies "emb-and-adpt" \ + --embedding_strategies "replace" \ + --adapter_reduction_factor $ADPT_REDUCTION_FACTOR \ + --adapter_config ${adapter_config} \ + --language $LANG \ No newline at end of file From 2bef084c0494b55f1acbd62d3d7f7f26bae4cf33 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 28 Mar 2022 22:38:02 -0400 Subject: [PATCH 053/142] update README --- scripts/lang_adapt/README.md | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index c9f96b8..e18d6b1 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -1,7 +1,7 @@ # README ### Tokenizer and Tokenization of Dataset -First, run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. +Run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. - `lang`: language name (e.g., "de", "th") - `tokenizer_dir`: path directory to save the tokenizer. - `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer. @@ -10,6 +10,6 @@ First, run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. - if `--extend_vocab`, we save the tokenizer as `{lang}_oscar_tokenizer_{vocab_size}` - otherwise, we save the tokenizer as `{lang}_oscar_tokenizer_full` -Then, +### Language Adaptation (6 Combinations) - use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`. - use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`. \ No newline at end of file From 6c5c05a72b8fb792b8cb7395f288b6da27dd80ec Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 29 Mar 2022 12:38:45 -0400 Subject: [PATCH 054/142] update --- scripts/lang_adapt/README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index e18d6b1..afc084d 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -3,13 +3,11 @@ ### Tokenizer and Tokenization of Dataset Run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. - `lang`: language name (e.g., "de", "th") -- `tokenizer_dir`: path directory to save the tokenizer. +- `tokenizer_dir`: path directory to save the tokenizer. The tokenizer will be saved as `{lang}_oscar_tokenizer_{vocab_size}` - `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer. - `vocab_size`: vocab size of the tokenizer -- `extend_vocab`: whether we are extending the vocabulary of the embedding layer (determines the saved name of the tokenizer and for sanity check purpose with vocab_size) - - if `--extend_vocab`, we save the tokenizer as `{lang}_oscar_tokenizer_{vocab_size}` - - otherwise, we save the tokenizer as `{lang}_oscar_tokenizer_full` ### Language Adaptation (6 Combinations) - use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. 
Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`. -- use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`. \ No newline at end of file +- use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`. + - Hack: after `trainer.save_model()`, manually save the `wte` and `wpe` weights. \ No newline at end of file From 1c701c540dd35d3d86a276aaf848bcb8c17bcd1d Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 4 Apr 2022 18:43:11 -0400 Subject: [PATCH 055/142] update xnli evaluation --- scripts/xnli/README.md | 80 ++++++++ .../archive_xnli.py} | 58 +++--- .../xnli_v2.py} | 107 ++++------ .../xnli_eval_de/adapters_xnli_de_tr1.3B.sh | 60 ------ .../adapters_xnli_de_tr1.3B_0shot.sh | 61 ------ .../adapters_xnli_de_tr1.3B_emb>adpt.sh | 58 ------ .../adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh | 59 ------ .../adapters_xnli_de_tr1.3B_wpe.sh | 60 ------ .../adapters_xnli_de_tr1.3B_wpe_0shot.sh | 61 ------ scripts/xnli_eval_de/xnli_de.py | 194 ------------------ scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh | 57 ----- .../xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh | 62 ------ 12 files changed, 148 insertions(+), 769 deletions(-) create mode 100644 scripts/xnli/README.md rename scripts/{xnli_eval_de/adapters_xnli_de.py => xnli/archive_xnli.py} (82%) rename scripts/{xnli_eval_de/adapters_xnli_de_wpe.py => xnli/xnli_v2.py} (63%) delete mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B.sh delete mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_0shot.sh delete mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt.sh delete mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh delete mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe.sh delete mode 100644 scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe_0shot.sh delete mode 100644 scripts/xnli_eval_de/xnli_de.py delete mode 100644 scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh delete mode 100644 scripts/xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh diff --git a/scripts/xnli/README.md b/scripts/xnli/README.md new file mode 100644 index 0000000..f368439 --- /dev/null +++ b/scripts/xnli/README.md @@ -0,0 +1,80 @@ +# XNLI Evaluation + +Use `xnli_v2.py` to run the evaluation on XNLI. 
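+
+The script fine-tunes a `GPT2ForSequenceClassification` head (3 labels) and reports XNLI accuracy. As a reference for what the reported number is, the sketch below mirrors the `compute_metrics` helper defined in `xnli_v2.py`:
+
+```
+import numpy as np
+from datasets import load_metric
+
+metric = load_metric("xnli")  # the XNLI metric is plain accuracy
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)  # pick the highest-scoring of the 3 NLI labels
+    return metric.compute(predictions=predictions, references=labels)
+```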
+ +### With Language Adapters +``` +LANG="th" +CACHE_DIR="/users/zyong2/data/zyong2/huggingface/" +lr=5e-5 + +# Original BigScience model and language-specific tokenizer +MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-ckpt118500" +TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_24000" + +# saved language adapters +MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/oscar_th" + +# saved embedding layers +WTE="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/transformer.wte.weight.pt" +WPE="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/transformer.wpe.weight.pt" + +# output directory +OUTPUT_DIR="$FP_BIGS/data/processed/021/xnli_th_adpt_100000samples" + +mkdir -p $OUTPUT_DIR + +# remove --zero_shot for supervised finetuning setting; otherwise, it will be cross-lingual finetuning setting. +# use --use_partial_data to test the code + +python xnli_v2.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $lr \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_NAME \ +--tokenizer $TOKENIZER_NAME \ +--do_train \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--wte $WTE \ +--wpe $WPE \ +--zero_shot +``` + +### Embedding only approach (No Language Adapters) +``` +LANG="th" +CACHE_DIR="/users/zyong2/data/zyong2/huggingface/" +lr=5e-5 + +# Saved finetuned model and language-specific tokenizer +MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_emb_100000samples" +TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_24000" + +# output directory +OUTPUT_DIR="$FP_BIGS/data/processed/021/xnli_th_adpt_100000samples" + +mkdir -p $OUTPUT_DIR + +# remove --zero_shot for supervised finetuning setting; otherwise, it will be cross-lingual finetuning setting. 
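+# (as implemented in xnli_v2.py: with --zero_shot, training and validation use English XNLI and
+#  the test set uses $LANG; without --zero_shot, train/validation/test all come from $LANG)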
+# use --use_partial_data to test the code + +python xnli_v2.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $lr \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_NAME \ +--tokenizer $TOKENIZER_NAME \ +--do_train \ +--do_eval_after_train \ +--zero_shot \ +--use_partial_data +``` diff --git a/scripts/xnli_eval_de/adapters_xnli_de.py b/scripts/xnli/archive_xnli.py similarity index 82% rename from scripts/xnli_eval_de/adapters_xnli_de.py rename to scripts/xnli/archive_xnli.py index 8ce10a0..24aed27 100644 --- a/scripts/xnli_eval_de/adapters_xnli_de.py +++ b/scripts/xnli/archive_xnli.py @@ -36,10 +36,8 @@ parser.add_argument("--use_partial_data", default=False, action="store_true") parser.add_argument("--zero_shot", default=False, action="store_true") -finetune_strategies = ["whole", "lang_adapters", "task_adapters"] -parser.add_argument("--madx_lang_adapter", required=True) +parser.add_argument("--madx_lang_adapter") parser.add_argument("--adapter_lang_name", required=True) -parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) args = parser.parse_args() if args.do_eval_after_train: @@ -100,9 +98,6 @@ def en_tokenize_function(examples): small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - from datasets import load_metric metric = load_metric("xnli") @@ -111,7 +106,6 @@ def compute_metrics(eval_pred): predictions = np.argmax(logits, axis=-1) return metric.compute(predictions=predictions, references=labels) - training_args = TrainingArguments( args.output_dir, overwrite_output_dir=True, @@ -132,9 +126,7 @@ def compute_metrics(eval_pred): ) def load_model(args, inference=False): - - # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one - # even when we call load_adapter + # for adapters, when we load with GPT2ForSequenceClassification, the embeddings are the original model if args.zero_shot and not inference: model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, num_labels=3, @@ -146,52 +138,49 @@ def load_model(args, inference=False): pad_token_id=tokenizer.pad_token_id, cache_dir=args.cache_dir) - if not args.zero_shot or (args.zero_shot and inference): + # this part is to replace the embedding layer + if args.madx_lang_adapter and (not args.zero_shot or (args.zero_shot and inference)): # if not zero shot, that means that we need to replace the embedding layers during training # we also need to replace embedding layers during inference causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) - # change the embedding layer of the original big science model + # change the embedding layer of the original big science model # by loading the adapters (which has saved lm_head) causal_lm_model.resize_token_embeddings(len(tokenizer)) causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") - # model has original bigscience embedding so replace it. 
model.resize_token_embeddings(len(tokenizer)) model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] if not inference: - adapter_name = model.load_adapter(args.madx_lang_adapter, - config="pfeiffer+inv", - load_as=args.adapter_lang_name) - if args.finetune_strategies == "whole": - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "lang_adapters": - model.train_adapter([args.adapter_lang_name]) - elif args.finetune_strategies == "task_adapters": - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - else: - raise ValueError("Lack configuration") + if not args.zero_shot and args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv", + load_as=args.adapter_lang_name) + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") print("🔥 ==================== Training: ==================== 🔥") + print(model) for name, param in model.named_parameters(): if not param.requires_grad: print(f"🥶 Frozen layer '{name}'") else: print(f"🚀 Trainable layer '{name}'") - print(model) else: print("🔥 ==================== Inference: ==================== 🔥") - assert args.pretrained_adapters_dir - if args.finetune_strategies == "lang_adapters": - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "task_adapters": - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") + if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + adapter_name = model.load_adapter(args.madx_lang_adapter) model.set_active_adapters(adapter_name) adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") model.set_active_adapters(adapter_name) + else: + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + + # for TGT -> TGT supervised finetuning setting, change adapter_name + adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + model.set_active_adapters(adapter_name) print(model) return model @@ -216,8 +205,9 @@ def load_model(args, inference=False): for checkpoint_dir in os.listdir(args.output_dir) if checkpoint_dir.startswith('checkpoint-') ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") model = load_model(args, inference=True) training_args.report_to = list() diff --git a/scripts/xnli_eval_de/adapters_xnli_de_wpe.py b/scripts/xnli/xnli_v2.py similarity index 63% rename from scripts/xnli_eval_de/adapters_xnli_de_wpe.py rename to scripts/xnli/xnli_v2.py index 82a673e..1887e83 100644 --- a/scripts/xnli_eval_de/adapters_xnli_de_wpe.py +++ b/scripts/xnli/xnli_v2.py @@ -9,7 +9,7 @@ import torch import numpy as np from transformers import TrainingArguments, Trainer, AdapterTrainer -from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM, GPT2Config +from transformers import 
AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM # setup logging import sys @@ -29,18 +29,16 @@ parser.add_argument("--gradient_accumulation_steps", type=int, default=4) parser.add_argument("--pretrained_model") parser.add_argument("--original_model") +parser.add_argument("--wte") +parser.add_argument("--wpe") parser.add_argument("--tokenizer") +parser.add_argument("--madx_lang_adapter") parser.add_argument("--do_train", default=False, action="store_true") parser.add_argument("--do_eval_after_train", default=False, action="store_true") parser.add_argument("--do_predict", default=False, action="store_true") parser.add_argument("--use_partial_data", default=False, action="store_true") parser.add_argument("--zero_shot", default=False, action="store_true") -finetune_strategies = ["whole", "lang_adapters", "task_adapters"] -parser.add_argument("--madx_lang_adapter", required=True) -parser.add_argument("--adapter_lang_name", required=True) -parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) - args = parser.parse_args() if args.do_eval_after_train: args.do_predict = True @@ -100,9 +98,6 @@ def en_tokenize_function(examples): small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - from datasets import load_metric metric = load_metric("xnli") @@ -111,7 +106,6 @@ def compute_metrics(eval_pred): predictions = np.argmax(logits, axis=-1) return metric.compute(predictions=predictions, references=labels) - training_args = TrainingArguments( args.output_dir, overwrite_output_dir=True, @@ -132,66 +126,52 @@ def compute_metrics(eval_pred): ) def load_model(args, inference=False): - - # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one - # even when we call load_adapter - print(args) - + # for adapters, when we load with GPT2ForSequenceClassification, the embeddings are the original model if args.zero_shot and not inference: model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=en_tokenizer.pad_token_id, - cache_dir=args.cache_dir) - - original_config = GPT2Config.from_pretrained(args.original_model) - original_model = GPT2ForSequenceClassification.from_pretrained(args.original_model, num_labels=3) - - # replace the embedding layer with original (contains-en) embedding. 
- logger.info("👉 Replace with en-langauge embedding") - model.resize_token_embeddings(original_config.vocab_size) - - model._modules['transformer']._modules['wte'] = original_model._modules['transformer']._modules['wte'] - model._modules['transformer']._modules['wpe'] = original_model._modules['transformer']._modules['wpe'] - logger.info(f"👉 Embedding (wte) changes to {model._modules['transformer']._modules['wte']}") - logger.info(f"👉 Embedding (wte) changes to {model._modules['transformer']._modules['wpe']}") + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) else: model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=tokenizer.pad_token_id, - cache_dir=args.cache_dir) + num_labels=3, + pad_token_id=tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + # this part is to replace the embedding layer + if not args.zero_shot or (args.zero_shot and inference): + if args.wpe: + wpe = torch.load(args.wpe) + model._modules['transformer']._modules['wpe'].weight.data = wpe + logger.info(f"Loaded wpe from {args.wpe}") + if args.wte: + wte = torch.load(args.wte) + model._modules['transformer']._modules['wte'].weight.data = wte + logger.info(f"Loaded wte from {args.wte}") if not inference: - adapter_name = model.load_adapter(args.madx_lang_adapter, - config="pfeiffer+inv", - load_as=args.adapter_lang_name) - if args.finetune_strategies == "whole": - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "lang_adapters": - model.train_adapter([args.adapter_lang_name]) - elif args.finetune_strategies == "task_adapters": - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - else: - raise ValueError("Lack configuration") + if not args.zero_shot and args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv") + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") print("🔥 ==================== Training: ==================== 🔥") + print(model) for name, param in model.named_parameters(): if not param.requires_grad: print(f"🥶 Frozen layer '{name}'") else: print(f"🚀 Trainable layer '{name}'") - print(model) else: print("🔥 ==================== Inference: ==================== 🔥") - assert args.pretrained_adapters_dir - if args.finetune_strategies == "lang_adapters": - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "task_adapters": - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") - model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + assert args.pretrained_adapters_dir + if args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter) model.set_active_adapters(adapter_name) + + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) print(model) return model @@ -210,14 +190,13 @@ def load_model(args, inference=False): trainer.train() if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - 
logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained task adapters from {args.pretrained_adapters_dir}") model = load_model(args, inference=True) training_args.report_to = list() @@ -229,4 +208,6 @@ def load_model(args, inference=False): compute_metrics=compute_metrics ) - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file + result = trainer.evaluate() + + print("Evaluate on Test:", result) \ No newline at end of file diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B.sh deleted file mode 100644 index a7d2507..0000000 --- a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -#SBATCH --array=16,48,384 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=4 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-013-adapters_xnli_de_tr1.3B - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_%a.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_%a.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" - TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_100K_adpt_${SLURM_ARRAY_TASK_ID}/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-pfeiffer+inv-${SLURM_ARRAY_TASK_ID}/oscar_de" - FT_STRATEGIES="task_adapters" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_${LANG}_de_100K_adpt_${SLURM_ARRAY_TASK_ID}" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 2 \ - --learning_rate $lr \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ - --adapter_lang_name "xnli-de" \ - --finetune_strategies $FT_STRATEGIES -done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_0shot.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_0shot.sh deleted file mode 100644 index 3e65669..0000000 --- a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_0shot.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -#SBATCH 
--array=16,48,384 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=4 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-013-adapters_xnli_de_tr1.3B_0shot - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_0shot_%a.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_0shot_%a.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" - TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_100K_adpt_${SLURM_ARRAY_TASK_ID}/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-pfeiffer+inv-${SLURM_ARRAY_TASK_ID}/oscar_de" - FT_STRATEGIES="task_adapters" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_${LANG}_de_100K_adpt_${SLURM_ARRAY_TASK_ID}_0shot" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 2 \ - --learning_rate $lr \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ - --adapter_lang_name "xnli-de" \ - --finetune_strategies $FT_STRATEGIES \ - --zero_shot -done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt.sh deleted file mode 100644 index ddc23d3..0000000 --- a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=4 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-013-adapters_xnli_de_tr1.3B_emb>adpt - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_emb>adpt.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_emb>adpt.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/de_50000" - TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/012/madx-bigs-de-50000-50000-48/oscar_de" - FT_STRATEGIES="task_adapters" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_${LANG}_de_100K_adpt_${SLURM_ARRAY_TASK_ID}" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 2 \ - --learning_rate $lr \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ - --adapter_lang_name "xnli-de" \ - --finetune_strategies $FT_STRATEGIES -done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh deleted file mode 100644 index c52b12f..0000000 --- a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_emb>adpt_0shot.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=4 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-013-adapters_xnli_detr1.3B_emb>adpt_0shot - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_emb>adpt_0shot.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_emb>adpt_0shot.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/de_50000" - TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/012/madx-bigs-de-50000-50000-48/oscar_de" - FT_STRATEGIES="task_adapters" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_de_50K>50K_adpt_0shot" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 2 \ - --learning_rate $lr \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ - --adapter_lang_name "xnli-de" \ - --finetune_strategies $FT_STRATEGIES \ - --zero_shot -done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe.sh deleted file mode 100644 index 5f7e725..0000000 --- a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=4 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-013-adapters_xnli_de_tr1.3B_wpe - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_wpe.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_wpe.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - ORIGINAL_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" - MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_wpe/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-peft-pfeiffer+inv-16-withpretainedmodel/pretrained_model" - TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_wpe/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-peft-pfeiffer+inv-16-withpretainedmodel/oscar_de" - FT_STRATEGIES="task_adapters" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_de_wpe_adpt_0shot" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de_wpe.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 2 \ - --learning_rate $lr \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --original_model $ORIGINAL_MODEL \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ - --adapter_lang_name "xnli-de" \ - --finetune_strategies $FT_STRATEGIES -done diff --git a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe_0shot.sh b/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe_0shot.sh deleted file mode 100644 index 4666eb1..0000000 --- a/scripts/xnli_eval_de/adapters_xnli_de_tr1.3B_wpe_0shot.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=4 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-013-adapters_xnli_de_tr1.3B_wpe_0shot - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_wpe_0shot.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/adapters_xnli_de_tr1.3B_wpe_0shot.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - ORIGINAL_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" - MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_wpe/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-peft-pfeiffer+inv-16-withpretainedmodel/pretrained_model" - TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/interim/de_wpe/tmp-network/user/vnikouli/Projects/bigscience/exp-009/madx-bs1b3-multi-ch118500-de-sample100000/withlngembft-lmhead-peft-pfeiffer+inv-16-withpretainedmodel/oscar_de" - FT_STRATEGIES="task_adapters" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_de_wpe_adpt_0shot" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-013/xnli/adapters_xnli_de_wpe.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 2 \ - --learning_rate $lr \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --original_model $ORIGINAL_MODEL \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ - --adapter_lang_name "xnli-de" \ - --finetune_strategies $FT_STRATEGIES \ - --zero_shot -done diff --git a/scripts/xnli_eval_de/xnli_de.py b/scripts/xnli_eval_de/xnli_de.py deleted file mode 100644 index f88ce76..0000000 --- a/scripts/xnli_eval_de/xnli_de.py +++ /dev/null @@ -1,194 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import AutoTokenizer, GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--lang", type=str, default="de") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--original_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") 
-parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -print("Arguments: ========") -print(args) - -# load dataset -if args.zero_shot: - print("0️⃣ 0-Shot") - # 0-shot: use english as train and validation - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - assert args.lang != "en" - - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = xnli_dataset['test'] -else: - print("👀 Supervised Training") - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - - train_dataset = xnli_dataset['train'] - val_dataset = xnli_dataset['validation'] - test_dataset = xnli_dataset['test'] - - -# load tokenizer -tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: - en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer - en_tokenizer.pad_token = en_tokenizer.eos_token - - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -def en_tokenize_function(examples): - return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - - -logger.info("Tokenizing the dataset...") -if args.zero_shot: - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) -else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False) - full_val_dataset = val_dataset.map(tokenize_function, batched=False) -full_test_dataset = test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - -from datasets import load_metric -metric = load_metric("xnli") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -def load_model(pretrained_model, cache_dir, pad_token_id=0): - return GPT2ForSequenceClassification.from_pretrained(pretrained_model, - num_labels=3, - pad_token_id=pad_token_id, - cache_dir=cache_dir) - - -if args.do_train: - logger.info("Start Training") - 
model = load_model(args.pretrained_model, - args.cache_dir, - en_tokenizer.pad_token_id if args.zero_shot else tokenizer.pad_token_id) - - if args.zero_shot: - # model is the finetuned model - original_config = GPT2Config.from_pretrained(args.original_model) - original_model = load_model(args.original_model, args.cache_dir) - no_en_wte = model._modules['transformer']._modules['wte'] - no_en_wpe = model._modules['transformer']._modules['wpe'] - - # replace the embedding layer with original (contains-en) embedding. - logger.info("👉 Replace with en-langauge embedding") - model.resize_token_embeddings(original_config.vocab_size) - - model._modules['transformer']._modules['wte'] = original_model._modules['transformer']._modules['wte'] - model._modules['transformer']._modules['wpe'] = original_model._modules['transformer']._modules['wpe'] - logger.info(f"👉 Embedding (wte) changes from {no_en_wte} to {model._modules['transformer']._modules['wte']}") - logger.info(f"👉 Embedding (wte) changes from {no_en_wpe} to {model._modules['transformer']._modules['wpe']}") - - trainer = Trainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - # FIXME: hack for now because the tokenizer loaded from bigscience doesn't have the same - # vocab size as indicated in the config.json - # not optimal fix for now because cooriginal_confignfig can be directly passed to from_pretrained - if args.zero_shot: - original_config.save_pretrained(args.pretrained_model) - - model = load_model(args.pretrained_model, args.cache_dir, tokenizer.pad_token_id) - if args.zero_shot: - # replace with target-language embedding. - logger.info("👉 Replace with target-language embedding") - logger.info(f"👉 len(tokenizer) = {len(tokenizer)}") - model.resize_token_embeddings(len(tokenizer)) - model._modules['transformer']._modules['wte'] = no_en_wte - model._modules['transformer']._modules['wpe'] = no_en_wpe - - training_args.report_to = list() - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh b/scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh deleted file mode 100644 index 4ef57e1..0000000 --- a/scripts/xnli_eval_de/xnli_de_ft_tr1.3B.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-013-xnli_de_ft_tr1.3B - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/xnli_de_ft_tr1.3B.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/xnli_de_ft_tr1.3B.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 5e-5 ) - -# following https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification#fine-tuning-on-xnli -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - # MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/de_100000" - MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" - # TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - TOKENIZER_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-013/xnli_${LANG}_ft_de_100000" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-013/xnli/xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 2 \ - --learning_rate $lr \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train -done diff --git a/scripts/xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh b/scripts/xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh deleted file mode 100644 index a6700fd..0000000 --- a/scripts/xnli_eval_de/xnli_de_ft_tr1.3B_0shot.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-013-xnli_de_ft_tr1.3B_0shot_ori - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-013/xnli_de_ft_tr1.3B_0shot_ori.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-013/xnli_de_ft_tr1.3B_0shot_ori.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 5e-5 ) - -# following https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification#fine-tuning-on-xnli -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/de_100000" - # MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" # cross-lingual 0-shot (for BigScience original) - - #TOKENIZER_NAME affects the tokenization of test set - TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - # TOKENIZER_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" # cross-lingual 0-shot (for BigScience original) - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/013/xnli_${LANG}_ft_0shot_de_100000_ori" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-013/xnli/xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 2 \ - --learning_rate $lr \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --zero_shot \ - --original_model "/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" -done From 76815fa9b45dce7fd62507f84308796a16a1aa30 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 22 Apr 2022 17:22:36 +0200 Subject: [PATCH 056/142] added script to train tokenizer only on a subset of the dataset --- scripts/lang_adapt/tokenized4clm_sampled.py | 67 ++ .../madx_exp/madxlastlayer_lngembft_clm.py | 618 ++++++++++++++++++ 2 files changed, 685 insertions(+) create mode 100644 scripts/lang_adapt/tokenized4clm_sampled.py create mode 100644 scripts/madx_exp/madxlastlayer_lngembft_clm.py diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py new file mode 100644 index 0000000..672277a --- /dev/null +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -0,0 +1,67 @@ +import torch +import datasets +import transformers +from transformers import AutoTokenizer +from datasets import load_dataset +import pathlib + +import argparse +import sys + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) +log_level = -1 +logger.setLevel(log_level) +datasets.utils.logging.set_verbosity(log_level) +transformers.utils.logging.set_verbosity(log_level) +transformers.utils.logging.enable_default_handler() +transformers.utils.logging.enable_explicit_format() +tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + +parser = argparse.ArgumentParser() +parser.add_argument('--lang', type=str, 
required=True) +parser.add_argument('--tokenizer_dir', type=str, required=True) +parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) +parser.add_argument('--vocab_size', default=130_000, type=int) +parser.add_argument('--extend_vocab', action='store_true') +parser.add_argument('--sample_size', default=None, type=int) + +args = parser.parse_args() +lang = args.lang +if args.extend_vocab: + assert args.vocab_size < 100_000 + +if args.sample_size: + raw_datasets = load_dataset( + "oscar", + f"unshuffled_deduplicated_{lang}", + cache_dir=args.hf_cache_dir + )["train"].shuffle(seed=42).select(range(args.sample_size)) + +else: + raw_datasets = load_dataset( + "oscar", + f"unshuffled_deduplicated_{lang}", + cache_dir=args.hf_cache_dir + )["train"] + +print(f"✅ Loaded raw_datasets OSCAR language {lang}") + +def batch_iterator(): + batch_size = 1000 + for i in range(0, len(raw_datasets), batch_size): + yield raw_datasets[i : i + batch_size]["text"] + +tokenizer = AutoTokenizer.from_pretrained("gpt2") +assert tokenizer.is_fast +new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) +print("✅ Trained tokenizer with len ", len(new_tokenizer)) + +new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}") +print(f"✅ Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}") diff --git a/scripts/madx_exp/madxlastlayer_lngembft_clm.py b/scripts/madx_exp/madxlastlayer_lngembft_clm.py new file mode 100644 index 0000000..7234cea --- /dev/null +++ b/scripts/madx_exp/madxlastlayer_lngembft_clm.py @@ -0,0 +1,618 @@ +""" +Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import torch +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +import transformers.adapters.composition as ac +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AdapterTrainer, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + MultiLingAdapterArguments, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.adapters.configuration import AdapterConfig +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." 
+ }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." 
+ }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def load_tokenizer(model_args): + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + return tokenizer + + + +def load_data(data_args, model_args): + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+
+    else:
+        data_files = {}
+        dataset_args = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = (
+            data_args.train_file.split(".")[-1]
+            if data_args.train_file is not None
+            else data_args.validation_file.split(".")[-1]
+        )
+        if extension == "txt":
+            extension = "text"
+            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args)
+
+    if "validation" not in raw_datasets.keys():
+        if data_args.max_eval_samples is not None and data_args.max_train_samples is not None:
+            raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples)
+        elif data_args.max_eval_samples is not None :
+            raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples)
+        else:
+            raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.validation_split_percentage/100.0)
+
+        raw_datasets['validation'] = raw_datasets['test']
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    return raw_datasets
+
+def load_model(model_args, tokenizer):
+    config_kwargs = {
+        "cache_dir": model_args.cache_dir,
+        "revision": model_args.model_revision,
+        "use_auth_token": True if model_args.use_auth_token else None,
+    }
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+        if model_args.config_overrides is not None:
+            logger.info(f"Overriding config: {model_args.config_overrides}")
+            config.update_from_string(model_args.config_overrides)
+    if model_args.model_name_or_path:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_args.model_name_or_path,
+            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+            revision=model_args.model_revision,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+    else:
+        model = AutoModelForCausalLM.from_config(config)
+        n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+
+    #TODO: remap embedding parameters
+    #if not tokenizer.name_or_path == model_args.model_name_or_path:
+    #    orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+
+    model.resize_token_embeddings(len(tokenizer))
+    return model
+
+def preprocess_data(training_args, data_args, model_args, tokenizer):
+    with training_args.main_process_first(desc="dataset map tokenization"):
+        saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_datasets.pt")
+        if not tokenizer.name_or_path ==
model_args.model_name_or_path: + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_tokenized_datasets.pt") + + saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + logger.info("Sanity check: loaded tokenized_datasets") + else: + raw_datasets = load_data(data_args, model_args) + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + + text_column_name = "text" if "text" in column_names else column_names[0] + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + logger.info("Sanity check: saved tokenized_datasets") + if "train" not in tokenized_datasets and training_args.do_train: + raise ValueError("--do_train requires a train dataset") + if "validation" not in tokenized_datasets and training_args.do_eval: + raise ValueError("--do_eval requires a validation dataset") + return tokenized_datasets + + +def get_lm_dataset(training_args, data_args, model_args, tokenizer): + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_datasets.pt") + if not tokenizer.name_or_path == model_args.model_name_or_path: + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_lm_datasets.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + logger.info("Sanity check: loaded lm_datasets") + else: + + tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + logger.info("Sanity check: saved lm_datasets") + return lm_datasets + +def add_adapters(adapter_args, data_args, model): + # Setup adapters + if adapter_args.train_adapter: + task_name = data_args.dataset_name or "clm" + task_name += f"_{adapter_args.language}" + # check if adapter already exists, otherwise add it + if task_name not in model.config.adapters: + # resolve the adapter config + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + leave_out = [i for i in range(0,23)] + ) + # load a pre-trained from Hub if specified + if adapter_args.load_adapter: + model.load_adapter( + adapter_args.load_adapter, + config=adapter_config, + load_as=task_name, + ) + # otherwise, add a fresh adapter + else: + model.add_adapter(task_name, config=adapter_config) + # optionally load a pre-trained language adapter + if adapter_args.load_lang_adapter: + # resolve the language adapter config + lang_adapter_config = AdapterConfig.load( + adapter_args.lang_adapter_config, + non_linearity=adapter_args.lang_adapter_non_linearity, + reduction_factor=adapter_args.lang_adapter_reduction_factor, + ) + # load the language adapter from Hub + lang_adapter_name = model.load_adapter( + adapter_args.load_lang_adapter, + config=lang_adapter_config, + load_as=adapter_args.language, + ) + else: + lang_adapter_name = None + # Freeze all model weights except of those of this adapter + model.train_adapter([task_name]) + # Set the adapters to be used in every forward pass + if lang_adapter_name: + model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + else: + model.set_active_adapters(task_name) + else: + if adapter_args.load_adapter or adapter_args.load_lang_adapter: + raise ValueError( + "Adapters can only be loaded in adapters training mode." 
+ "Use --train_adapter to enable adapter training" + ) + trainable_params = 0 + frozen_params = 0 + emb_params = 0 + for name, param in model.named_parameters(): + if not param.requires_grad: + if not "wte" in name and not "lm_head" in name: + print(f"🥶 Frozen layer '{name}'") + frozen_params +=param.numel() + else: + param.requires_grad = True + print(f"🚀 Trainable layer '{name}'") + emb_params += param.numel() + else: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + print(f"Total frozen parameters: {frozen_params}") + print(f"Total emb parameters: {emb_params}") + print(f"Total trainable parameters: {trainable_params}") + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, adapter_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() + training_args.data_dir = f'{training_args.output_dir}/../' + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"model_args {model_args}") + logger.info(f"data_args {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + logger.info(f"Adapter parameters {adapter_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + pass + #raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." + #) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + tokenizer = load_tokenizer(model_args) + model = load_model(model_args, tokenizer) + + add_adapters(adapter_args, data_args, model) + # Preprocessing the datasets. 
+ lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) + if training_args.do_train: + train_dataset = lm_datasets["train"] + + if training_args.do_eval: + + eval_dataset = lm_datasets["validation"] + + + # Initialize our Trainer + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. + data_collator=default_data_collator, + ) + + logger.info(model) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + +# if training_args.push_to_hub: +# trainer.push_to_hub(**kwargs) +# else: +# trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() From 5012f7f843ae83ef915ffd02e1ce80c9a4f83e4b Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 22 Apr 2022 17:23:56 +0200 Subject: [PATCH 057/142] added script to train tokenizer only on a subset of the dataset --- scripts/lang_adapt/train_tokenizer.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 scripts/lang_adapt/train_tokenizer.sh diff --git a/scripts/lang_adapt/train_tokenizer.sh b/scripts/lang_adapt/train_tokenizer.sh new file mode 100644 index 0000000..7a95182 --- /dev/null +++ b/scripts/lang_adapt/train_tokenizer.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=cpu + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + + + +bs_dir=/tmp-network/user/vnikouli/Projects/bigscience +lng=$1 +sample_size=$2 +vocab_size=$3 +source $bs_dir/multilingual-modeling/scripts/env/bin/activate +python tokenized4clm_sampled.py --lang $lng --tokenizer_dir $bs_dir/tokenizers --hf_cache_dir $bs_dir/data 
--vocab_size $vocab_size --sample_size $sample_size
+

From 55f62fde76dc70f50799fd68f7c659f49f548a74 Mon Sep 17 00:00:00 2001
From: Vassilina Nikoulina
Date: Fri, 22 Apr 2022 17:27:34 +0200
Subject: [PATCH 058/142] updated instructions for sampled tokenizer

---
 scripts/lang_adapt/README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md
index afc084d..f2ed6c2 100644
--- a/scripts/lang_adapt/README.md
+++ b/scripts/lang_adapt/README.md
@@ -7,6 +7,14 @@ Run `tokenized4clm.py` to train the tokenizer on OSCAR dataset.
 - `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer.
 - `vocab_size`: vocab size of the tokenizer
 
+
+Run `tokenized4clm_sampled.py` to train the tokenizer on a subset of the OSCAR dataset.
+- `lang`: language name (e.g., "de", "th")
+- `tokenizer_dir`: directory where the trained tokenizer is saved. The tokenizer will be saved as `{lang}_oscar_{sample_size}_tokenizer_{vocab_size}`
+- `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer.
+- `vocab_size`: vocab size of the tokenizer
+- `sample_size`: the number of samples (randomly selected) used to train the tokenizer
+
 ### Language Adaptation (6 Combinations)
 - use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`.
 - use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`.
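For reference, a minimal invocation of `tokenized4clm_sampled.py` consistent with the arguments documented above might look like the sketch below; the language, vocabulary size, sample size, and directory paths are illustrative placeholders rather than values prescribed by the repository.

```bash
# Train a tokenizer on 100K randomly sampled German OSCAR documents with a 24K vocabulary.
# Paths are placeholders; point them at your own tokenizer and HF cache directories.
python scripts/lang_adapt/tokenized4clm_sampled.py \
    --lang de \
    --tokenizer_dir /path/to/tokenizers \
    --hf_cache_dir /path/to/hf_cache \
    --vocab_size 24000 \
    --sample_size 100000
```

With these settings, the tokenizer would be written to `/path/to/tokenizers/de_oscar_100000_tokenizer_24000`, following the naming scheme expected by the adaptation scripts above.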
From 4ce7678d7daba71df1e9f93c45856baeeb81cc6c Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 22 Apr 2022 17:29:52 +0200 Subject: [PATCH 059/142] updated training script: added some extra parameters in the running script, and changed the slurm running settings --- scripts/lang_adapt/run_clm_adpt_vn.sh | 83 +++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 scripts/lang_adapt/run_clm_adpt_vn.sh diff --git a/scripts/lang_adapt/run_clm_adpt_vn.sh b/scripts/lang_adapt/run_clm_adpt_vn.sh new file mode 100644 index 0000000..44d12af --- /dev/null +++ b/scripts/lang_adapt/run_clm_adpt_vn.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH -p gpu +#SBATCH --gres="gpu:1" + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J run_clm_madx + +# Specify an output file +#SBATCH -o /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_madx-%j.out +#SBATCH -e /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_madx-%j.err + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com +#SBATCH --constraint="gpu_v100&gpu_32g" + +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience +# Set up the environment by loading modules +source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate + +data_sample=$1 +ch=118500 +lng=$2 +adapter_reduction_factor=$3 +dataset=oscar +adapter_config="pfeiffer+inv" +vocabsize=24000 +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +tokenizer_dir="${FP_BIGS}/tokenizers/${lng}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k +cache_dir="${FP_BIGS}/data/" +data_dir="${FP_BIGS}/exp-ext-${lng}/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}-$( basename $tokenizer_dir )" +data_tok_dir=${data_dir}/lng_tok + +output_dir="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +logging_dir="${FP_BIGS}/logs/exp-ext-${lng}/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}-$( basename $tokenizer_dir )/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +echo $output_dir + +BIGS_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name + + +mkdir -p $output_dir +mkdir -p $logging_dir + +adapter_config="pfeiffer+inv" +python $FP_BIGS/multilingual-modeling/scripts/lang_adapt/madx_run_clm.py \ + --seed 0 \ + --fp16 \ + --model_name_or_path $BIGS_MODEL \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name "unshuffled_deduplicated_${lng}" \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 1000 \ + --evaluation_strategy "epoch" \ + --max_eval_samples 5000 \ + --save_steps 10000 \ + --save_strategy "steps" \ + --save_total_limit 3 \ + --max_train_samples $data_sample \ + --max_steps 50000 \ + --train_adapter \ + --load_best_model_at_end \ + --lang_adapt_strategies "emb-and-adpt" \ + --embedding_strategies "overlap-replace" \ + --adapter_reduction_factor $adapter_reduction_factor \ + --adapter_config ${adapter_config} \ + --language $lng From 5cadd4341e31ca73704363230f86b107e101116f Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 22 Apr 2022 17:35:34 +0200 
Subject: [PATCH 060/142] added overlap-replace parameter, added possibility to save embedding layer (instead of whole model), added early stopping, --- scripts/lang_adapt/madx_run_clm.py | 80 +++++++++++++++++------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index bcea14c..de46184 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -16,6 +16,8 @@ from datasets import load_dataset import transformers +from transformers import EarlyStoppingCallback + import transformers.adapters.composition as ac from transformers import ( CONFIG_MAPPING, @@ -105,7 +107,7 @@ class ModelArguments: ) embedding_strategies: str = field( default="", - metadata={"help": "choose one of the two strategies - 'replace', 'extend'"}, + metadata={"help": "choose one of the two strategies - 'replace', 'extend', 'overlap-replace'"}, ) def __post_init__(self): @@ -242,9 +244,9 @@ def load_data(data_args, model_args): if "validation" not in raw_datasets.keys(): if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: - raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) + raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples*2, test_size = data_args.max_eval_samples*2) elif data_args.max_eval_samples is not None : - raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) + raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples*2) else: raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) raw_datasets['validation'] = raw_datasets['test'] @@ -256,12 +258,12 @@ def load_data(data_args, model_args): # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
- - if data_args.max_train_samples is not None: + if data_args.max_train_samples is not None and len(raw_datasets['train']) > data_args.max_train_samples: # FIXME: currently assume the loaded checkpoint is trained with the first data_args.max_train_samples number of samples - raw_datasets["train"] = raw_datasets["train"].filter(lambda example, indice: indice < data_args.max_train_samples, with_indices=True) + #raw_datasets["train"] = raw_datasets["train"].filter(lambda example, indice: indice < data_args.max_train_samples, with_indices=True) raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) - if data_args.max_eval_samples is not None: + + if data_args.max_eval_samples is not None and len(raw_datasets['validation']) > data_args.max_eval_samples: raw_datasets["validation"] = raw_datasets["validation"].select(range(data_args.max_eval_samples)) return raw_datasets @@ -297,16 +299,12 @@ def load_model(model_args, tokenizer): logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") #TODO: remap embedding parameters - #if not tokenizer.name_or_path == model_args.model_name_or_path: - # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - - model.resize_token_embeddings(len(tokenizer)) + return model def preprocess_data(training_args, data_args, model_args, tokenizer): with training_args.main_process_first(desc="dataset map tokenization"): saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_data.pt") - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) logger.info(f"✅ loaded tokenized_data") @@ -314,7 +312,7 @@ def preprocess_data(training_args, data_args, model_args, tokenizer): raw_datasets = load_data(data_args, model_args) assert len(raw_datasets['train']) == data_args.max_train_samples logger.info(f"🧠 Sanity check: loaded raw datasets have {data_args.max_train_samples} samples") - + # First we tokenize all the texts. 
if training_args.do_train: column_names = raw_datasets["train"].column_names @@ -343,8 +341,10 @@ def tokenize_function(examples): load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) logger.info(f"✅ saved tokenized_data") + if "train" not in tokenized_datasets and training_args.do_train: raise ValueError("--do_train requires a train dataset") if "validation" not in tokenized_datasets and training_args.do_eval: @@ -408,17 +408,18 @@ def group_texts(examples): ) torch.save(lm_datasets, saved_lm_datasets_fp) logger.info("✅ saved lm_data") - print(lm_datasets) return lm_datasets -def modify_model(adapter_args, data_args, model_args, model): - if model_args.lang_adapt_strategies == "emb": - for name, param in model.named_parameters(): - if "wte" not in name and "wpe" not in name: - param.requires_grad = False +def modify_model(adapter_args, data_args, model_args, tokenizer, model): + #if "emb" in model_args.lang_adapt_strategies: + # if "replace" in model_args.embedding_strategies: + # for name, param in model.named_parameters(): + # if "wte" not in name and "wpe" not in name and "lm_head" not in name: + # param.requires_grad = False + # Setup adapters - elif adapter_args.train_adapter: + if adapter_args.train_adapter: task_name = data_args.dataset_name or "clm" task_name += f"_{adapter_args.language}" # check if adapter already exists, otherwise add it @@ -456,18 +457,29 @@ def modify_model(adapter_args, data_args, model_args, model): else: lang_adapter_name = None # Freeze all model weights except of those of this adapter - model.train_adapter([task_name]) + model.train_adapter(task_name, train_embeddings=True) # Set the adapters to be used in every forward pass - if lang_adapter_name: - model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) - else: - model.set_active_adapters(task_name) + #if lang_adapter_name: + # model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + #else: + # model.set_active_adapters(task_name) + else: if adapter_args.load_adapter or adapter_args.load_lang_adapter: raise ValueError( "Adapters can only be loaded in adapters training mode." 
"Use --train_adapter to enable adapter training" ) + + if model_args.embedding_strategies == "overlap-replace": + if not tokenizer.name_or_path == model_args.model_name_or_path: + orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + model.add_embeddings('lng_emb', tokenizer, reference_embedding='default', reference_tokenizer=orig_tokenizer ) + model._active_embedding = "lng_emb" + model.delete_embeddings('default') + model.tie_weights() + elif model_args.embedding_strategies == "replace": + model.resize_token_embeddings(len(tokenizer)) trainable_params = 0 frozen_params = 0 emb_params = 0 @@ -478,7 +490,7 @@ def modify_model(adapter_args, data_args, model_args, model): else: print(f"🚀 Trainable layer '{name}'") trainable_params += param.numel() - + if "wte" and "wpe" in name: emb_params += param.numel() @@ -504,7 +516,7 @@ def main(): training_args.data_dir = f'{training_args.output_dir}' assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt') - assert model_args.embedding_strategies in ('replace', 'extend') + assert model_args.embedding_strategies in ('replace', 'extend', 'overlap-replace') # Setup logging logging.basicConfig( @@ -551,8 +563,7 @@ def main(): tokenizer = load_tokenizer(model_args) model = load_model(model_args, tokenizer) - - modify_model(adapter_args, data_args, model_args, model) + modify_model(adapter_args, data_args, model_args, tokenizer, model) # Preprocessing the datasets. lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) if training_args.do_train: @@ -560,8 +571,6 @@ def main(): if training_args.do_eval: eval_dataset = lm_datasets["validation"] - - # Initialize our Trainer trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer trainer = trainer_class( @@ -571,7 +580,7 @@ def main(): eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. - data_collator=default_data_collator, + data_collator=default_data_collator ) logger.info(model) @@ -583,8 +592,11 @@ def main(): checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint + trainer.add_callback(EarlyStoppingCallback(3)) train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload + trainer.save_model() # Saves the tokenizer too for easy upload # normally this part only saves the adapters? (TODO: check) + + trainer.model.save_embeddings(f'{trainer.args.output_dir}/embedding_layer') metrics = train_result.metrics @@ -635,4 +647,4 @@ def _mp_fn(index): if __name__ == "__main__": - main() \ No newline at end of file + main() From e619e73542a9f56687e75a0903f1368dcfe3edac Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 2 May 2022 14:47:38 -0400 Subject: [PATCH 061/142] update eval --- scripts/eval_xnli/README.md | 38 ++++ scripts/eval_xnli/adapters_xnli_de.py | 244 ++++++++++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 scripts/eval_xnli/README.md create mode 100644 scripts/eval_xnli/adapters_xnli_de.py diff --git a/scripts/eval_xnli/README.md b/scripts/eval_xnli/README.md new file mode 100644 index 0000000..fd349b0 --- /dev/null +++ b/scripts/eval_xnli/README.md @@ -0,0 +1,38 @@ +# XNLI (Cross-Lingual and Supervised Setting) + +Current scripts are for XNLI (German). + +``` +OUTPUT_DIR=... # where you want to save checkpoints at +LANG="de" +CACHE_DIR=... 
# cache dir for saving/loading HF models and XNLI datasets. +LR=1e-5 +MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" +TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME=".../oscar_de" + +# we finetune task adapters for XNLI +FT_STRATEGIES="task_adapters" + +mkdir -p $OUTPUT_DIR +python adapters_xnli_de.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_NAME \ +--tokenizer $TOKENIZER_NAME \ +--do_train \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--adapter_lang_name "xnli-de" \ +--finetune_strategies $FT_STRATEGIES \ +--zero_shot +``` + +Remove `--zero_shot` for supervised finetuning setting. \ No newline at end of file diff --git a/scripts/eval_xnli/adapters_xnli_de.py b/scripts/eval_xnli/adapters_xnli_de.py new file mode 100644 index 0000000..46140aa --- /dev/null +++ b/scripts/eval_xnli/adapters_xnli_de.py @@ -0,0 +1,244 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter") +parser.add_argument("--adapter_lang_name", required=True) +parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigsciece model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + + train_dataset = xnli_en_dataset['train'] + 
val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +logger.info(full_train_dataset[0]) +logger.info(full_train_dataset[100]) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args, inference=False): + + # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one + # even when we call load_adapter + if args.zero_shot and not inference: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) + else: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + if not args.zero_shot or (args.zero_shot and inference): + # if not zero shot, that means that we need to replace the embedding layers during training + # we also need to replace embedding layers during inference + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + + # change the embedding 
layer of the original big science model + # by loading the adapters (which has saved lm_head) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + if args.madx_lang_adapter: + causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + + # model has original bigscience embedding so replace it. + model.resize_token_embeddings(len(tokenizer)) + model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] + + if not inference: + if not args.zero_shot: + if args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv", + load_as=args.adapter_lang_name) + if args.finetune_strategies == "whole": + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "lang_adapters": + model.train_adapter([args.adapter_lang_name]) + elif args.finetune_strategies == "task_adapters": + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + else: + raise ValueError("Lack configuration") + + print("🔥 ==================== Training: ==================== 🔥") + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + print(model) + else: + print("🔥 ==================== Inference: ==================== 🔥") + if args.finetune_strategies == "lang_adapters": + assert args.pretrained_adapters_dir + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") + model.set_active_adapters(adapter_name) + elif args.finetune_strategies == "task_adapters": + if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + adapter_name = model.load_adapter(args.madx_lang_adapter) + model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + else: + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + + # for TGT -> TGT supervised finetuning setting, change adapter_name + adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + model.set_active_adapters(adapter_name) + print(model) + + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at 
end of file From 7d906f61307dfabe96abbc1d830f5bc9c48e5e84 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 2 May 2022 14:49:53 -0400 Subject: [PATCH 062/142] update eval --- scripts/eval_xnli/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/eval_xnli/README.md b/scripts/eval_xnli/README.md index fd349b0..17fc051 100644 --- a/scripts/eval_xnli/README.md +++ b/scripts/eval_xnli/README.md @@ -35,4 +35,8 @@ $OUTPUT_DIR \ --zero_shot ``` -Remove `--zero_shot` for supervised finetuning setting. \ No newline at end of file +Remove `--zero_shot` for supervised finetuning setting. + +### Zero-shot Prompt-based Setting + +See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). \ No newline at end of file From 067a2a7bf9139d86099e60ed0a7294518abced3c Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 2 May 2022 14:52:29 -0400 Subject: [PATCH 063/142] clean --- scripts/exp-005/download_oscar.py | 11 - scripts/exp-005/download_oscar_de.sh | 29 - scripts/exp-005/run_clm.py | 551 --------------- scripts/exp-005/run_clm_de.sh | 56 -- scripts/exp-005/train_tokenizer_gpt2.py | 20 - scripts/exp-005/train_tokenizer_gpt2.sh | 29 - .../GPT2ForQuestionAnswering.cpython-37.pyc | Bin 5168 -> 0 bytes .../__pycache__/trainer_qa.cpython-37.pyc | Bin 2574 -> 0 bytes scripts/exp-006/xnli/adapters_xnli_de.py | 191 ----- scripts/exp-006/xnli/adapters_xnli_de_gpt2.sh | 58 -- scripts/exp-006/xnli/adapters_xnli_ko.py | 234 ------- scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh | 57 -- scripts/exp-006/xnli/xnli_de.py | 162 ----- scripts/exp-006/xnli/xnli_de_ft_gpt2.sh | 53 -- scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh | 54 -- scripts/exp-006/xnli/xnli_de_gpt2.sh | 53 -- scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh | 54 -- scripts/exp-006/xnli/xnli_ko.py | 213 ------ scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh | 52 -- scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh | 53 -- ...423e05ef7d46c9ba83614b4c527017571b9d2898b8 | 124 ---- ...5ef7d46c9ba83614b4c527017571b9d2898b8.json | 1 - ...5ef7d46c9ba83614b4c527017571b9d2898b8.lock | 0 .../exp-006/xquad/GPT2ForQuestionAnswering.py | 129 ---- .../GPT2ForQuestionAnswering.cpython-37.pyc | Bin 5202 -> 0 bytes .../__pycache__/trainer_qa.cpython-37.pyc | Bin 2580 -> 0 bytes .../xquad/__pycache__/utils_qa.cpython-37.pyc | Bin 13232 -> 0 bytes scripts/exp-006/xquad/eval_germanquad.sh | 52 -- scripts/exp-006/xquad/eval_qa.py | 655 ------------------ scripts/exp-006/xquad/trainer_qa.py | 105 --- scripts/exp-006/xquad/utils_qa.py | 431 ------------ scripts/exp-007/madx_run_clm.py | 593 ---------------- scripts/exp-007/run_clm_de.sh | 60 -- scripts/exp-007/run_clm_en.sh | 61 -- scripts/exp-007/run_clm_ko.sh | 61 -- scripts/exp-008/xnli/xnli_de.py | 151 ---- scripts/exp-008/xnli/xnli_de_mbert.sh | 53 -- scripts/exp-008/xnli/xnli_de_mbert_0shot.sh | 54 -- scripts/exp-008/xnli/xnli_ko.py | 197 ------ scripts/exp-008/xnli/xnli_ko_mbert.sh | 53 -- scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh | 54 -- 41 files changed, 4764 deletions(-) delete mode 100644 scripts/exp-005/download_oscar.py delete mode 100644 scripts/exp-005/download_oscar_de.sh delete mode 100644 scripts/exp-005/run_clm.py delete mode 100644 scripts/exp-005/run_clm_de.sh delete mode 100644 scripts/exp-005/train_tokenizer_gpt2.py delete mode 100644 scripts/exp-005/train_tokenizer_gpt2.sh delete mode 100644 scripts/exp-006/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc delete mode 100644 
scripts/exp-006/__pycache__/trainer_qa.cpython-37.pyc delete mode 100644 scripts/exp-006/xnli/adapters_xnli_de.py delete mode 100644 scripts/exp-006/xnli/adapters_xnli_de_gpt2.sh delete mode 100644 scripts/exp-006/xnli/adapters_xnli_ko.py delete mode 100644 scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh delete mode 100644 scripts/exp-006/xnli/xnli_de.py delete mode 100644 scripts/exp-006/xnli/xnli_de_ft_gpt2.sh delete mode 100644 scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh delete mode 100644 scripts/exp-006/xnli/xnli_de_gpt2.sh delete mode 100644 scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh delete mode 100644 scripts/exp-006/xnli/xnli_ko.py delete mode 100644 scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh delete mode 100644 scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh delete mode 100644 scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8 delete mode 100644 scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.json delete mode 100755 scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.lock delete mode 100644 scripts/exp-006/xquad/GPT2ForQuestionAnswering.py delete mode 100644 scripts/exp-006/xquad/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc delete mode 100644 scripts/exp-006/xquad/__pycache__/trainer_qa.cpython-37.pyc delete mode 100644 scripts/exp-006/xquad/__pycache__/utils_qa.cpython-37.pyc delete mode 100644 scripts/exp-006/xquad/eval_germanquad.sh delete mode 100644 scripts/exp-006/xquad/eval_qa.py delete mode 100644 scripts/exp-006/xquad/trainer_qa.py delete mode 100644 scripts/exp-006/xquad/utils_qa.py delete mode 100644 scripts/exp-007/madx_run_clm.py delete mode 100644 scripts/exp-007/run_clm_de.sh delete mode 100644 scripts/exp-007/run_clm_en.sh delete mode 100644 scripts/exp-007/run_clm_ko.sh delete mode 100644 scripts/exp-008/xnli/xnli_de.py delete mode 100644 scripts/exp-008/xnli/xnli_de_mbert.sh delete mode 100644 scripts/exp-008/xnli/xnli_de_mbert_0shot.sh delete mode 100644 scripts/exp-008/xnli/xnli_ko.py delete mode 100644 scripts/exp-008/xnli/xnli_ko_mbert.sh delete mode 100644 scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh diff --git a/scripts/exp-005/download_oscar.py b/scripts/exp-005/download_oscar.py deleted file mode 100644 index 4a2f504..0000000 --- a/scripts/exp-005/download_oscar.py +++ /dev/null @@ -1,11 +0,0 @@ -from datasets import load_dataset -import os -from pathlib import Path -import argparse - -parser = argparse.ArgumentParser(description='') -parser.add_argument('lang', type=str, help='language subset') -args = parser.parse_args() - -dataset = load_dataset("oscar", f"unshuffled_deduplicated_{args.lang}", cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/oscar_{args.lang}") -print("Done") \ No newline at end of file diff --git a/scripts/exp-005/download_oscar_de.sh b/scripts/exp-005/download_oscar_de.sh deleted file mode 100644 index c102b49..0000000 --- a/scripts/exp-005/download_oscar_de.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=3-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-005-download_oscar_de - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/005/download_oscar_de.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/005/download_oscar_de.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -python3 $FP_BIGS/scripts/exp-005/download_oscar.py de \ No newline at end of file diff --git a/scripts/exp-005/run_clm.py b/scripts/exp-005/run_clm.py deleted file mode 100644 index 396fbfa..0000000 --- a/scripts/exp-005/run_clm.py +++ /dev/null @@ -1,551 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. - -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=causal-lm -""" -# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. - -import torch -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from typing import Optional -import pathlib - -import datasets -from datasets import load_dataset - -import transformers -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - Trainer, - TrainingArguments, - default_data_collator, - set_seed, -) -from transformers.testing_utils import CaptureLogger -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.11.0.dev0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - -logger = logging.getLogger(__name__) - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." 
- }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - - block_size: Optional[int] = field( - default=None, - metadata={ - "help": "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." 
- }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. 
- set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - if model_args.model_name_or_path: - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - model = AutoModelForCausalLM.from_config(config) - n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - - model.resize_token_embeddings(len(tokenizer)) - for name, param in model.named_parameters(): - if name not in ('transformer.wte.weight', 'transformer.wpe.weight'): - print(f"🥶 Freeze layer '{name}'") - param.requires_grad = False - else: - param.requires_grad = True - - # Preprocessing the datasets. - # First we tokenize all the texts. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - else: - column_names = raw_datasets["validation"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - - def tokenize_function(examples): - with CaptureLogger(tok_logger) as cl: - output = tokenizer(examples[text_column_name]) - # clm input could be much much longer than block_size - if "Token indices sequence length is longer than the" in cl.out: - tok_logger.warning( - "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 
- ) - return output - - with training_args.main_process_first(desc="dataset map tokenization"): - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.output_dir}/tokenized_datasets.pt") - saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): - tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) - print("Sanity check: loaded tokenized_datasets") - else: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - torch.save(tokenized_datasets, saved_tokenized_datasets_fp) - print("Sanity check: saved tokenized_datasets") - - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." - ) - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." - ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - with training_args.main_process_first(desc="grouping texts together"): - saved_lm_datasets_fp = pathlib.Path(f"{training_args.output_dir}/lm_datasets.pt") - saved_lm_datasets_fp.parent.mkdir(parents=True, exist_ok=True) - if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): - lm_datasets = torch.load(str(saved_lm_datasets_fp)) - print("Sanity check: loaded lm_datasets") - else: - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - torch.save(lm_datasets, saved_lm_datasets_fp) - print("Sanity check: saved lm_datasets") - - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = lm_datasets["train"] - if data_args.max_train_samples is not None: - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = lm_datasets["validation"] - if data_args.max_eval_samples is not None: - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - - import gc - del tokenized_datasets - gc.collect() - - # Initialize our Trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. 
- data_collator=default_data_collator, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - print("Checkpoint:", checkpoint) - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - if training_args.push_to_hub: - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - trainer.push_to_hub(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() diff --git a/scripts/exp-005/run_clm_de.sh b/scripts/exp-005/run_clm_de.sh deleted file mode 100644 index fb2d291..0000000 --- a/scripts/exp-005/run_clm_de.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=6-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:4 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=16 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-005-run_clm_de - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/005/run_clm_de.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/005/run_clm_de.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -module load gitlfs/2.7.1 -source $FP_BIGS/env_lang_mod/bin/activate - -tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/exp-005/oscar-de-tokenizer" -cache_dir="${FP_BIGS}/data/external/oscar_de" -output_dir="${FP_BIGS}/data/processed/exp-005/ft-gpt2-de" -logging_dir="${FP_BIGS}/reports/exp-005/ft-gpt2-de" - -python $FP_BIGS/scripts/exp-005/run_clm.py \ - --model_name_or_path gpt2 \ - --tokenizer_name $tokenizer_dir \ - --dataset_name oscar \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_de \ - --logging_dir $logging_dir \ - --report_to "tensorboard" \ - --learning_rate 0.001 \ - --do_train \ - --do_eval \ - --output_dir $output_dir \ - --preprocessing_num_workers 8 \ - --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ - --eval_steps 1000 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ No newline at end of file diff --git a/scripts/exp-005/train_tokenizer_gpt2.py b/scripts/exp-005/train_tokenizer_gpt2.py deleted file mode 100644 index b90709a..0000000 --- a/scripts/exp-005/train_tokenizer_gpt2.py +++ /dev/null @@ -1,20 +0,0 @@ -from datasets import load_dataset - -import os -from pathlib import Path - - -lang = "de" -dataset = load_dataset("oscar", f"unshuffled_deduplicated_{lang}", cache_dir=f"/users/zyong2/data/zyong2/bigscience/data/external/oscar_{lang}") - -def batch_iterator(): - batch_size = 1000 - for i in range(0, len(dataset), batch_size): - yield dataset['train'][i : i + batch_size]["text"] - -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("gpt2") -assert tokenizer.is_fast -new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50_257) -new_tokenizer.save_pretrained(f"/users/zyong2/data/zyong2/bigscience/data/processed/exp-005/oscar-{lang}-tokenizer") \ No newline at end of file diff --git a/scripts/exp-005/train_tokenizer_gpt2.sh b/scripts/exp-005/train_tokenizer_gpt2.sh deleted file mode 100644 index a273035..0000000 --- a/scripts/exp-005/train_tokenizer_gpt2.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=3-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
- -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-005-train_tokenizer_gpt2 - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/005/train_tokenizer_gpt2.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/005/train_tokenizer_gpt2.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -python3 $FP_BIGS/scripts/exp-005/train_tokenizer_gpt2.py \ No newline at end of file diff --git a/scripts/exp-006/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc b/scripts/exp-006/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc deleted file mode 100644 index 928bfcfc22fb936c0683c62ff9f91d578603a56c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5168 zcmd^D%WoXF8K0SbuXZKB?KEw}CP1@`>S3p74?-YUPZc^}z)$W$!z^Y8)&T!Iffd>V+s1w+aKg%<5>^LQJ+8vIHmK`< zEog+zK{IR(TDJAI#p{K|b1OZgyDB2JEX zpqn=yK7N25n9?4lk*EAPq5(|3{`I51&mIZBC#WBB_Aq8V$XobQcReIrKHZ`te+nIL z4Qyr&oNuj>&1~kpZVoD}!D_7j+8$KDwW`*jhOFgxkl1!Cew}iO+kP3z7hL$!*mDYc z_#c&2fuH;w8fCq*-rAXc3d+gsXHI6hj;dt#D;s)MReM{9w+;Mqir<=<^|p0tp|W5{ zd$-@tYcfr^$eXU~N4|31yd9;X8_<0oNYQ{T@+Bo`BuB9bxe&-fUf%%?P?0Z9e8za> zO8+I#+s~;Rrz%O6%sZOa3+|66N`mgHk_V%_?#0o_AA@FFuImL&dY|e4zoP2AbWm zn@-!l=DhsLrHe}0celM|ScuOZ_#u(|@RQ57Wo6bg&~WBtm8_c8vU=9YnprDrXG_d} zvvjhYEi>l>>tqEqY-cN{_Q{p($}@*mSe4b@IKOjFuAW@WuD*pae7l-mD|+xQyEbxQ zHGuUGK;gCpv^uPD(E6h-+F7^kscP1FTRXL0*{pI<6@P+}q1Aye-MKG+g^~Hvh5c%* z0{dNu*>&b**H0b5Nq5#)jqEyWzOhekWH*45&MW8SCiJdlUFhA)Zf3V`gByKlo!riD zkL+w&bGHoGX;BB%(keFv<$$0HewA&EysPH4i3 znb6y(%A05at@ekjWEdyL;h@j>h^B#(M8(DI-8kaI{^oR7+j_RxjpIm5#vXF z+)Pb4(aeNx*x?g|>s3D&s@_Awk3nUm|1=sqo`e z5+3l7N8l`sC!kOK1h_>onxLBsgXG?(hW6(0_j6HQuK#vtm#h0aJP-SXfM&+&BLukW36XQ8D8CwVae>>3eGni&#bl0?LZ0C{%Cz4u~81LLdw2w2wnSo|89z(2IGhwLQ zXrDvK;^c8WQV;3TTp&P4Eho)K(eG6E`*VZZD7U$kmym~3o$UBhJvoQQr`q&Iz%%R` zz^zA6L>zM=&*}nDrl5jCi}N)TdjS-tl0Y3PG{XTu)+jur0Te?3Mr~@U6kr?!@_rC| z2Xanpm-PWHfCO!)Vi z1ep5XgiPomyqs_!(hCqHWB(A)tK&T2FT^5nAuq^DoCZt>XJg2R{(=g>qeM98X$5X{0ns0>w-*R~qpQ;r zQ$;bLO-3l>P_Xul=NU632_T;0S1e)?AyZk3`E$WzipZ&(sLP`E`&eg2_J20bOfX#! z>EdfaOUAnMg&4sTzyxq9SI;Z_OsqSn^e;;v1rIXyKcOSVh1Td)E+QfBh4g6Ecq{l* z0e(HrEJVjSuxL4mO^f9^AkMbY;R6YgtuvI@kpmZE9x#FD$KIz$LB!SCK9vv# zp<$oS+`xdeAaOY}{}s&{#~6ZK3_)8Ez_2ifXbMF+lo*Bd>rkdbMC*P04G)5Nm4wHH z#bZpgUaI)VtAsMhO$qnFD_p%wYJB$l>uDsR%H=P4?{1%vPsmqT*AqM#l6zAQOsbvc z;In)@x%YIY0!)LK&-^*eoRT$OjACiMLG@n<3h-S#mu#IGuVPU!+h%KFG~-3rK3k>t zKe_vKws(IiZ%y~hS1?Doki1-zxbo#GTHTP!gM3+Q%~i(Nq6x79=>=gW$>Yt3@=Q?jrLg6Zc)_drH8$)q1C!f58I>?+Ww7cX)=!E0B&q zj-UpCgFl{@nxX^m#R@cVDUbxd${U4E^9m$*IK<;Izpl?ZuD~mZ{vB}@*WAJeE^{n_ z$7b;XHb21TLu`JC&5yA8F*YAzL!il5&R8SwF6cjR6owtFOoNVjlh=cI?BhLyp-_z4 zxZLyyT_YPa!sSggp*!+a-Y%EGT~E_2fL>c+oe7Jj6gQAT-P}Yj8&Idcpn{>0@(XCJ z`&HYy;ICQH7Y%mXZaX*Z8_pHz!4EDUI(7$I{OiE@CAst>qS&h6-7TK&?d{#&ejTr8 zA{uhMo^f43c^Y89?7B07`nI?Z-v!Dgn$V!dCEBNH zcb4E<=)yle6L4PBLBretHBA7CB8>xqcT%DTO>S@Jo%t{;>i7xoku;eZFLN4her_cC R6(UvN+5kJCx{hX&Y$OHWW4pRXrgDEmf;lN~u7iveXVU$F>3$)bHyqS6P^}c@Z<%@dVCGeg9?S+4| zM#vvHS$q}@u0xYQ0||*oLK+$|9U7i7G(D5z+>EVZ$tzL9%|jx-1&$Tl!?Hmr89H7? 
z@3CTcSk;`GSEuA_5|%>yh=evb4o$BSTDM8Nd~nUZ(URlsAoitfrM=d@JHn3=F7CDD zhzC(G3R+p(`eDpv7NyCh8;N|#MU?bgnJ;&x`zOdq+J-6@$J9n7S(b`mhh?KN^B?$8 z>~F`sO;z)j;9(TtHQ$e6G3y3+c=4gy_{p04DOrb>wb z11=@&jT0?tKtUP|2JLOTU*6VAh!K@R~_>U!>1VC%7u z2)WZXRW;;um%?1FgV780usDSXO_O+ERil2A3XWcpin8`;Rnt+8aUyxwep$-$iR!|;(|!&NC&p69&eM% ziX)jRON-~q%omw_7-c)k9Km@rPW#>}rinrZ7V_n3fQVhT<0oOvg;z}ztuu(m0fHHJY8gn&l)wuGS-cTapwO(y0~3g8Kmd`#^dksCd4XWQlk%2 zR)ICZ24=gw_>%P#O%4y~D`5l5GZa{yD7 z*AO?yR!Jl7DdO=J#FdtadwQ2-wY)MYL#yX*Xg@Zwd;w-|EpY*u5l~EtIDrKHs>($$ zijy#d<#&)-fr`Xq7KKuLg6zf|CTPPqy{}OrHer`IjpPgx%-{MZaTZ6IR0UQ9@jeo) zW8xB!BGi_+(RTwVRkILkS$njs^B7!Pti2J2;v>BFV>z6)}jiIvnZC@JKpB}56WKbZv~J&w_tz@@K{+m_bN+< yK;U|(j`!=-v{hhUoWnOUhQtR*Q0QZQUd;nn=k~dVc}8Nrw2UTT-iB 3.0, 1, 0) - labels = np.where(labels > 3.0, 1, 0) - # print(predictions) - # print(labels) - # assert False - - # apply metric - metric = load_metric(KLUE_TASKS[args.klue_task].metric.split("/")[0]) - if "/" in KLUE_TASKS[args.klue_task].metric: - return metric.compute(predictions=predictions, - references=labels, - average=KLUE_TASKS[args.klue_task].metric.split("/")[1]) - else: - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - eval_steps=500 if not args.use_partial_data else 10, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="steps", - save_strategy="steps", - logging_strategy="steps", - logging_steps=500, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - - -def load_model(args, inference=False): - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=0, - cache_dir=args.cache_dir) - if not inference: - adapter_name = model.load_adapter(args.madx_lang_adapter, - config="pfeiffer+inv", - load_as=args.adapter_lang_name) - if args.finetune_strategies == "whole": - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "lang_adapters": - model.train_adapter([args.adapter_lang_name]) - elif args.finetune_strategies == "task_adapters": - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - else: - raise ValueError("Lack configuration") - - print(model) - for name, param in model.named_parameters(): - if not param.requires_grad: - print(f"🥶 Frozen layer '{name}'") - else: - print(f"🚀 Trainable layer '{name}'") - else: - print("🔥 ==================== Inference: ==================== 🔥") - assert args.pretrained_adapters_dir - if args.finetune_strategies == "lang_adapters": - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "task_adapters": - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") - model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") - model.set_active_adapters(adapter_name) - print(model) - return model - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = AdapterTrainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if 
args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - model = load_model(args, inference=True) - training_args.report_to = list() - - trainer = AdapterTrainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh b/scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh deleted file mode 100644 index ae705f4..0000000 --- a/scripts/exp-006/xnli/adapters_xnli_ko_gpt2.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-006-adapters_xnli_ko_gpt2_task_adapters - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/adapters_xnli_ko_gpt2_task_adapters.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/adapters_xnli_ko_gpt2_task_adapters.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="gpt2" - TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-ko" - LANG="ko" - MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/exp-007/madx-gpt2-ko/checkpoint-166500/oscar_ko" - FT_STRATEGIES="task_adapters" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/adapters_xnli_${LANG}_gpt2_lr-${lr}_strategy-${FT_STRATEGIES}" - CACHE_DIR="$FP_BIGS/data/external/hf" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-006/xnli/adapters_xnli_ko.py $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ - --adapter_lang_name "xnli-ko" \ - --finetune_strategies $FT_STRATEGIES -done diff --git a/scripts/exp-006/xnli/xnli_de.py b/scripts/exp-006/xnli/xnli_de.py deleted file mode 100644 index b7b9ace..0000000 --- a/scripts/exp-006/xnli/xnli_de.py +++ /dev/null @@ -1,162 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | 
[{time}] - {message}") - - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--lang", type=str, default="de") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -print("Arguments: ========") -print(args) - -# load dataset -if args.zero_shot: - print("0️⃣ 0-Shot") - # 0-shot: use english as train and validation - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - assert args.lang != "en" - - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = xnli_dataset['test'] -else: - print("👀 Supervised Training") - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - - train_dataset = xnli_dataset['train'] - val_dataset = xnli_dataset['validation'] - test_dataset = xnli_dataset['test'] - - -# load tokenizer -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) -if args.zero_shot: - en_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", cache_dir=args.cache_dir) - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -def en_tokenize_function(examples): - return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - - -logger.info("Tokenizing the dataset...") -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: - en_tokenizer.pad_token = en_tokenizer.eos_token - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) -else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False) - full_val_dataset = val_dataset.map(tokenize_function, batched=False) -full_test_dataset = test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - -from datasets import load_metric -metric = load_metric("xnli") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - 
eval_steps=500, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="steps", - save_strategy="steps", - logging_strategy="steps", - logging_steps=500, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -def load_model(args): - return GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=0, - cache_dir=args.cache_dir) - - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - - model = load_model(args) - training_args.report_to = list() - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-006/xnli/xnli_de_ft_gpt2.sh b/scripts/exp-006/xnli/xnli_de_ft_gpt2.sh deleted file mode 100644 index b2b344d..0000000 --- a/scripts/exp-006/xnli/xnli_de_ft_gpt2.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-006-xnli_de_ft_gpt2 - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_ft_gpt2.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_ft_gpt2.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="yongzx/gpt2-finetuned-oscar-de" - TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-de" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_ft_gpt2_${lr}" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-006/xnli/xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train -done diff --git a/scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh b/scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh deleted file mode 100644 index 8fe2699..0000000 --- a/scripts/exp-006/xnli/xnli_de_ft_gpt2_0shot.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-006-xnli_de_ft_gpt2_0shot - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_ft_gpt2_0shot.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_ft_gpt2_0shot.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="gpt2" - TOKENIZER_NAME="gpt2" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_gpt2_0shot_$lr" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-006/xnli/xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --zero_shot -done diff --git a/scripts/exp-006/xnli/xnli_de_gpt2.sh b/scripts/exp-006/xnli/xnli_de_gpt2.sh deleted file mode 100644 index ee7427c..0000000 --- a/scripts/exp-006/xnli/xnli_de_gpt2.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-006-xnli_de_gpt2 - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_gpt2.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_gpt2.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="gpt2" - TOKENIZER_NAME="gpt2" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_gpt2_${lr}" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-006/xnli/xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train -done diff --git a/scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh b/scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh deleted file mode 100644 index 012da74..0000000 --- a/scripts/exp-006/xnli/xnli_de_gpt2_0shot.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-006-xnli_de_gpt2_0shot - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_gpt2_0shot.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_de_gpt2_0shot.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="yongzx/gpt2-finetuned-oscar-de" - TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-de" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_ft_gpt2_0shot_$lr" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-006/xnli/xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --zero_shot -done diff --git a/scripts/exp-006/xnli/xnli_ko.py b/scripts/exp-006/xnli/xnli_ko.py deleted file mode 100644 index 37f9c3c..0000000 --- a/scripts/exp-006/xnli/xnli_ko.py +++ /dev/null @@ -1,213 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import GPT2Tokenizer, GPT2ForSequenceClassification - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - 
{message}") - - -KLUE = namedtuple("KLUE", ["klue_split", "num_labels", "metric", "model_type"]) -KLUE_TASKS = { - "topic-cls": KLUE(klue_split="ynat", num_labels=7, metric="f1/macro", model_type="seq-cls"), - "sts-pearsonr": KLUE(klue_split="sts", num_labels=1, metric="pearsonr", model_type="seq-cls"), - "sts-binary": KLUE(klue_split="sts", num_labels=1, metric="f1/macro", model_type="seq-cls"), - "nli": KLUE(klue_split="nli", num_labels=3, metric="accuracy", model_type="seq-cls"), -} - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--klue_task", choices=KLUE_TASKS.keys(), default="nli") -parser.add_argument("--lang", type=str, default="ko") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -print("Arguments: ========") -print(args) - -# load dataset -klue_dataset = load_dataset("klue", KLUE_TASKS[args.klue_task].klue_split, cache_dir=args.cache_dir) -if args.zero_shot: - print("0️⃣ 0-Shot") - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - - if "test" not in klue_dataset: - _train_dataset = klue_dataset['train'].train_test_split(train_size=0.8, shuffle=True, seed=42) - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = klue_dataset['validation'] - else: - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = klue_dataset['test'] -else: - print("👀 Supervised Training") - if "test" not in klue_dataset: - _train_dataset = klue_dataset['train'].train_test_split(train_size=0.8, shuffle=True, seed=42) - train_dataset = _train_dataset['train'] - val_dataset = _train_dataset['test'] - test_dataset = klue_dataset['validation'] - else: - train_dataset = klue_dataset['train'] - val_dataset = klue_dataset['validation'] - test_dataset = klue_dataset['test'] - - -# load tokenizer -tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) -if args.zero_shot: - en_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", cache_dir=args.cache_dir) - -def tokenize_function(examples): - if KLUE_TASKS[args.klue_task].klue_split == "ynat": - return tokenizer(examples["title"], max_length=128, padding="max_length", truncation=True) - elif KLUE_TASKS[args.klue_task].klue_split == "sts": - return tokenizer(f'{examples["sentence1"]} {tokenizer.eos_token} {examples["sentence2"]}', max_length=128, padding="max_length", truncation=True) - elif KLUE_TASKS[args.klue_task].klue_split == "nli": - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -def en_tokenize_function(examples): - if KLUE_TASKS[args.klue_task].klue_split == 
"ynat": - return en_tokenizer(examples["title"], max_length=128, padding="max_length", truncation=True) - elif KLUE_TASKS[args.klue_task].klue_split == "sts": - return en_tokenizer(f'{examples["sentence1"]} {en_tokenizer.eos_token} {examples["sentence2"]}', max_length=128, padding="max_length", truncation=True) - elif KLUE_TASKS[args.klue_task].klue_split == "nli": - return en_tokenizer(f'{examples["premise"]} {en_tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - - -def postprocessing(example): - if KLUE_TASKS[args.klue_task].klue_split == "sts": - example['labels'] = example['labels']['real-label'] - return example - else: - return example - -logger.info("Tokenizing the dataset...") -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: - en_tokenizer.pad_token = en_tokenizer.eos_token - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False).map(postprocessing) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False).map(postprocessing) -else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False).map(postprocessing) - full_val_dataset = val_dataset.map(tokenize_function, batched=False).map(postprocessing) -full_test_dataset = test_dataset.map(tokenize_function, batched=False).map(postprocessing) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - -def compute_metrics(eval_pred): - logits, labels = eval_pred - - if "pearsonr" in KLUE_TASKS[args.klue_task].metric: - predictions = logits.flatten() - else: - predictions = np.argmax(logits, axis=-1) - - ### only for STS-binary - if args.klue_task == "sts-binary": - predictions = np.where(logits.flatten() > 3.0, 1, 0) - labels = np.where(labels > 3.0, 1, 0) - # print(predictions) - # print(labels) - # assert False - - # apply metric - metric = load_metric(KLUE_TASKS[args.klue_task].metric.split("/")[0]) - if "/" in KLUE_TASKS[args.klue_task].metric: - return metric.compute(predictions=predictions, - references=labels, - average=KLUE_TASKS[args.klue_task].metric.split("/")[1]) - else: - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - - -def load_model(args): - if KLUE_TASKS[args.klue_task].model_type == "seq-cls": - return GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=KLUE_TASKS[args.klue_task].num_labels, - pad_token_id=0, - cache_dir=args.cache_dir) - - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - 
compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - model = load_model(args) - training_args.report_to = list() - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh b/scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh deleted file mode 100644 index 648a4c5..0000000 --- a/scripts/exp-006/xnli/xnli_ko_ft_gpt2.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-006-xnli_ko_ft_gpt2 - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_ko_ft_gpt2.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_ko_ft_gpt2.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="yongzx/gpt2-finetuned-oscar-ko" - TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-ko" - LANG="ko" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_ft_gpt2_${lr}" - CACHE_DIR="$FP_BIGS/data/external/hf" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-006/xnli/xnli_ko.py $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train -done diff --git a/scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh b/scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh deleted file mode 100644 index 8e59776..0000000 --- a/scripts/exp-006/xnli/xnli_ko_ft_gpt2_0shot.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-006-xnli_ko_ft_gpt2_0shot - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/xnli_ko_ft_gpt2_0shot.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/xnli_ko_ft_gpt2_0shot.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="yongzx/gpt2-finetuned-oscar-ko" - TOKENIZER_NAME="yongzx/gpt2-finetuned-oscar-ko" - LANG="ko" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/xnli/$LANG/xnli_${LANG}_ft_gpt2_0shot_${lr}" - CACHE_DIR="$FP_BIGS/data/external/hf" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-006/xnli/xnli_ko.py $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --zero_shot -done diff --git a/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8 b/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8 deleted file mode 100644 index ddeddb4..0000000 --- a/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8 +++ /dev/null @@ -1,124 +0,0 @@ -{ - "lingaccept": { - "cola": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_lingaccept_cola_pfeiffer.json" - } - }, - "default": "adapters/ukp/gpt2_lingaccept_cola_pfeiffer.json" - } - }, - "lm": { - "poem": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_lm_poem_pfeiffer.json" - } - }, - "default": "adapters/ukp/gpt2_lm_poem_pfeiffer.json" - } - }, - "nli": { - "multinli": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_nli_multinli_pfeiffer.json" - } - }, - "b1017368d7a97b11": { - "versions": { - "ukp": "adapters/ukp/gpt2_nli_multinli_houlsby.json" - } - }, - "default": "adapters/ukp/gpt2_nli_multinli_pfeiffer.json" - }, - "qnli": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_nli_qnli_pfeiffer.json" - } - }, - "b1017368d7a97b11": { - "versions": { - "ukp": "adapters/ukp/gpt2_nli_qnli_houlsby.json" - } - }, - "default": "adapters/ukp/gpt2_nli_qnli_pfeiffer.json" - }, - "rte": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_nli_rte_pfeiffer.json" - } - }, - "b1017368d7a97b11": { - "versions": { - "ukp": "adapters/ukp/gpt2_nli_rte_houlsby.json" - } - }, - "default": "adapters/ukp/gpt2_nli_rte_pfeiffer.json" - } - }, - "sentiment": { - "sst-2": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_sentiment_sst-2_pfeiffer.json" - } - }, - "default": "adapters/ukp/gpt2_sentiment_sst-2_pfeiffer.json" - } - }, - "sentiment ": { - "sst-2": { - "b1017368d7a97b11": { - "versions": { - "ukp": "adapters/ukp/gpt2_sentiment_sst-2_houlsby.json" - } - }, - "default": 
"adapters/ukp/gpt2_sentiment_sst-2_houlsby.json" - } - }, - "sts": { - "mrpc": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_sts_mrpc_pfeiffer.json" - } - }, - "b1017368d7a97b11": { - "versions": { - "ukp": "adapters/ukp/gpt2_sts_mrpc_houlsby.json" - } - }, - "default": "adapters/ukp/gpt2_sts_mrpc_pfeiffer.json" - }, - "qqp": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_sts_qqp_pfeiffer.json" - } - }, - "b1017368d7a97b11": { - "versions": { - "ukp": "adapters/ukp/gpt2_sts_qqp_houlsby.json" - } - }, - "default": "adapters/ukp/gpt2_sts_qqp_pfeiffer.json" - }, - "sts-b": { - "9076f36a74755ac4": { - "versions": { - "ukp": "adapters/ukp/gpt2_sts_sts-b_pfeiffer.json" - } - }, - "b1017368d7a97b11": { - "versions": { - "ukp": "adapters/ukp/gpt2_sts_sts-b_houlsby.json" - } - }, - "default": "adapters/ukp/gpt2_sts_sts-b_pfeiffer.json" - } - } -} \ No newline at end of file diff --git a/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.json b/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.json deleted file mode 100644 index 857f977..0000000 --- a/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.json +++ /dev/null @@ -1 +0,0 @@ -{"url": "https://raw.githubusercontent.com/Adapter-Hub/Hub/master/dist/v2/index/gpt2.json", "etag": "W/\"b52e703eb3a35aeb1ba6beaec6300d7d265a1a585fe67273fa7c6e4622a63cd4\""} \ No newline at end of file diff --git a/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.lock b/scripts/exp-006/xnli/~/.cache/torch/adapters/788fe0071ca04df2a05115962a1db807e0fe0e444ee204662dd7c7d613f11ba2.e6cf48c1510f737c7099a2423e05ef7d46c9ba83614b4c527017571b9d2898b8.lock deleted file mode 100755 index e69de29..0000000 diff --git a/scripts/exp-006/xquad/GPT2ForQuestionAnswering.py b/scripts/exp-006/xquad/GPT2ForQuestionAnswering.py deleted file mode 100644 index bb91cd6..0000000 --- a/scripts/exp-006/xquad/GPT2ForQuestionAnswering.py +++ /dev/null @@ -1,129 +0,0 @@ -import torch -from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss -from torch.nn import functional as F - -from transformers import GPT2PreTrainedModel, GPT2Model - - -# warnings can be ignored: -# https://discuss.huggingface.co/t/gpt2lmheadmodel-from-pretrained-gpt2-not-loading-attn-weights/432 - -class GPT2ForQuestionAnswering(GPT2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.transformer = GPT2Model(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - self.is_parallelizable = False - - self.init_weights() - - def forward( - self, - input_ids=None, - past_key_values=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - input_mask=None, - head_mask=None, - inputs_embeds=None, - use_cache=False, - output_attentions=False, - return_dict=False, - start_positions=None, - end_positions=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the start of the 
labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): - Contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model - should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
-
-        Examples::
-
-            from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
-            import torch
-
-            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-            model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            start_positions = torch.tensor([1])
-            end_positions = torch.tensor([3])
-
-            outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-            loss = outputs[0]
-
-        """
-
-        outputs = self.transformer(
-            input_ids=input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            input_mask=input_mask,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            return_dict=return_dict
-        )
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions)
diff --git a/scripts/exp-006/xquad/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc b/scripts/exp-006/xquad/__pycache__/GPT2ForQuestionAnswering.cpython-37.pyc
deleted file mode 100644
index f15d2f948c6dc94e3d74735b459c40c14f0a13c4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 5202
[base85-encoded bytecode payload omitted]
diff --git a/scripts/exp-006/xquad/__pycache__/utils_qa.cpython-37.pyc b/scripts/exp-006/xquad/__pycache__/utils_qa.cpython-37.pyc
deleted file mode 100644
index 85335495cb3f1c0a2d45627e5c52893967850bb8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13232
[base85-encoded bytecode payload omitted]
diff --git a/scripts/exp-006/xquad/eval_germanquad.sh b/scripts/exp-006/xquad/eval_germanquad.sh
deleted file mode 100644
index d18edbe..0000000
--- a/scripts/exp-006/xquad/eval_germanquad.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-# Request half an hour of runtime:
-#SBATCH --time=1-23:59:00
-
-# Ask for the GPU partition and 1 GPU
-#SBATCH --partition=3090-gcondo --gres=gpu:1
-
-# Default resources are 1 core with 2.8GB of memory.
-#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-006-eval_germanquad - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/006/eval_germanquad.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/006/eval_germanquad.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="gpt2" - TOKENIZER_NAME="gpt2" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-006/germanquad/$lr" - CACHE_DIR="$FP_BIGS/data/external/xquad" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-006/xquad/eval_qa.py \ - --output_dir $OUTPUT_DIR \ - --dataset_name "deepset/germanquad" \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 50 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --model_name_or_path $MODEL_NAME \ - --tokenizer_name $TOKENIZER_NAME \ - --do_train \ - --do_predict -done diff --git a/scripts/exp-006/xquad/eval_qa.py b/scripts/exp-006/xquad/eval_qa.py deleted file mode 100644 index 402ff31..0000000 --- a/scripts/exp-006/xquad/eval_qa.py +++ /dev/null @@ -1,655 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer. -""" -# You can also adapt this script on your own question answering task. Pointers for this are left as comments. - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import datasets -from datasets import load_dataset, load_metric - -import transformers -from trainer_qa import QuestionAnsweringTrainer -from transformers import ( - AutoConfig, - AutoModelForQuestionAnswering, - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - HfArgumentParser, - PreTrainedTokenizerFast, - TrainingArguments, - default_data_collator, - set_seed, -) - -from GPT2ForQuestionAnswering import GPT2ForQuestionAnswering -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version -from utils_qa import postprocess_qa_predictions - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -# check_min_version("4.13.0.dev0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") - -logger = logging.getLogger(__name__) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_seq_length: int = field( - default=384, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - pad_to_max_length: bool = field( - default=True, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " - "be faster on GPU but will be slower on TPU)." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." 
- }, - ) - version_2_with_negative: bool = field( - default=False, metadata={"help": "If true, some of the examples do not have an answer."} - ) - null_score_diff_threshold: float = field( - default=0.0, - metadata={ - "help": "The threshold used to select the null answer: if the best answer has a score that is less than " - "the score of the null answer minus this threshold, the null answer is selected for this example. " - "Only useful when `version_2_with_negative=True`." - }, - ) - doc_stride: int = field( - default=128, - metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, - ) - n_best_size: int = field( - default=20, - metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, - ) - max_answer_length: int = field( - default=30, - metadata={ - "help": "The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another." - }, - ) - - def __post_init__(self): - if ( - self.dataset_name is None - and self.train_file is None - and self.validation_file is None - and self.test_file is None - ): - raise ValueError("Need either a dataset name or a training/validation file/test_file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.test_file is not None: - extension = self.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. 
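For readers new to the HfArgumentParser pattern used in main() above, here is a minimal standalone sketch; MyDataArgs and the example flags are hypothetical and only illustrate the dataclass-driven CLI style, they are not part of this patch:

    from dataclasses import dataclass, field

    from transformers import HfArgumentParser, TrainingArguments

    @dataclass
    class MyDataArgs:
        # Hypothetical data arguments, mirroring the dataclass-style CLI of eval_qa.py.
        dataset_name: str = field(default="deepset/germanquad")
        max_seq_length: int = field(default=384)

    # Flags are parsed straight into typed dataclass instances.
    parser = HfArgumentParser((MyDataArgs, TrainingArguments))
    data_args, training_args = parser.parse_args_into_dataclasses(
        ["--output_dir", "/tmp/qa", "--do_train"]
    )
    print(data_args.dataset_name, training_args.do_train)  # deepset/germanquad True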
- last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
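The loading step that follows swaps the stock question-answering head for the custom GPT2ForQuestionAnswering and reuses the EOS token as padding, since GPT-2 ships without a pad token. A small self-contained sketch of just that pad-token workaround, assuming the public gpt2 checkpoint:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token of its own

    batch = tokenizer(
        ["What is the capital of Germany?", "Short question"],
        padding="max_length",
        max_length=16,
        truncation=True,
    )
    # The shorter sequence is padded out with the eos/pad id (50256 for gpt2).
    print(batch["input_ids"][1][-4:])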
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=True, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - tokenizer.pad_token = tokenizer.eos_token - - # model = AutoModelForQuestionAnswering.from_pretrained( - # model_args.model_name_or_path, - # from_tf=bool(".ckpt" in model_args.model_name_or_path), - # config=config, - # cache_dir=model_args.cache_dir, - # revision=model_args.model_revision, - # use_auth_token=True if model_args.use_auth_token else None, - # ) - - model = GPT2ForQuestionAnswering.from_pretrained(model_args.model_name_or_path, num_labels=2, pad_token_id=0) - - # Tokenizer check: this script requires a fast tokenizer. - if not isinstance(tokenizer, PreTrainedTokenizerFast): - raise ValueError( - "This example script only works for models that have a fast tokenizer. Checkout the big table of models " - "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " - "requirement" - ) - - # Preprocessing the datasets. - # Preprocessing is slighlty different for training and evaluation. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - elif training_args.do_eval: - column_names = raw_datasets["validation"].column_names - else: - column_names = raw_datasets["test"].column_names - question_column_name = "question" if "question" in column_names else column_names[0] - context_column_name = "context" if "context" in column_names else column_names[1] - answer_column_name = "answers" if "answers" in column_names else column_names[2] - - # Padding side determines if we do (question|context) or (context|question). - pad_on_right = tokenizer.padding_side == "right" - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - # Training preprocessing - def prepare_train_features(examples): - # Some of the questions have lots of whitespace on the left, which is not useful and will make the - # truncation of the context fail (the tokenized question will take a lots of space). So we remove that - # left whitespace - examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] - - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. 
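The comment above describes how one long context is split into several overlapping features; the tokenizer call right below does this for the training set. As a standalone illustration, a toy sketch with the gpt2 tokenizer and small max_length and stride values rather than the script's 384 and 128:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
    tok.pad_token = tok.eos_token

    question = "Who wrote Faust?"
    context = "Johann Wolfgang von Goethe wrote Faust. " * 20  # deliberately long

    enc = tok(
        question,
        context,
        truncation="only_second",       # never truncate the question
        max_length=48,
        stride=16,                      # tokens shared by consecutive windows
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # One (question, context) pair becomes several overlapping features, and
    # overflow_to_sample_mapping points every feature back to example 0.
    print(len(enc["input_ids"]), enc["overflow_to_sample_mapping"])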
- tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - # The offset mappings will give us a map from token to character position in the original context. This will - # help us compute the start_positions and end_positions. - offset_mapping = tokenized_examples.pop("offset_mapping") - - # Let's label those examples! - tokenized_examples["start_positions"] = [] - tokenized_examples["end_positions"] = [] - - for i, offsets in enumerate(offset_mapping): - # We will label impossible answers with the index of the CLS token. - input_ids = tokenized_examples["input_ids"][i] - # cls_index = input_ids.index(tokenizer.cls_token_id) - - # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - answers = examples[answer_column_name][sample_index] - # If no answers are given, set the cls_index as answer. - if len(answers["answer_start"]) == 0: - logger.debug("Yes") - assert False - # tokenized_examples["start_positions"].append(cls_index) - # tokenized_examples["end_positions"].append(cls_index) - else: - # Start/end character index of the answer in the text. - start_char = answers["answer_start"][0] - end_char = start_char + len(answers["text"][0]) - - # Start token index of the current span in the text. - token_start_index = 0 - while sequence_ids[token_start_index] != (1 if pad_on_right else 0): - token_start_index += 1 - - # End token index of the current span in the text. - token_end_index = len(input_ids) - 1 - while sequence_ids[token_end_index] != (1 if pad_on_right else 0): - token_end_index -= 1 - - # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). - if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # Otherwise move the token_start_index and token_end_index to the two ends of the answer. - # Note: we could go after the last offset if the answer is the last word (edge case). 
- while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: - token_start_index += 1 - tokenized_examples["start_positions"].append(token_start_index - 1) - while offsets[token_end_index][1] >= end_char: - token_end_index -= 1 - tokenized_examples["end_positions"].append(token_end_index + 1) - - return tokenized_examples - - if training_args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = raw_datasets["train"] - if data_args.max_train_samples is not None: - # We will select sample from whole data if argument is specified - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - # Create train feature from dataset - with training_args.main_process_first(desc="train dataset map pre-processing"): - train_dataset = train_dataset.map( - prepare_train_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on train dataset", - ) - if data_args.max_train_samples is not None: - # Number of samples might increase during Feature Creation, We select only specified max samples - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - # Validation preprocessing - def prepare_validation_features(examples): - # Some of the questions have lots of whitespace on the left, which is not useful and will make the - # truncation of the context fail (the tokenized question will take a lots of space). So we remove that - # left whitespace - examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] - - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. - tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - - # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the - # corresponding example_id and we will store the offset mappings. - tokenized_examples["example_id"] = [] - - for i in range(len(tokenized_examples["input_ids"])): - # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - context_index = 1 if pad_on_right else 0 - - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - tokenized_examples["example_id"].append(examples["id"][sample_index]) - - # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token - # position is part of the context or not. 
- tokenized_examples["offset_mapping"][i] = [ - (o if sequence_ids[k] == context_index else None) - for k, o in enumerate(tokenized_examples["offset_mapping"][i]) - ] - - return tokenized_examples - - if training_args.do_eval: - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_examples = raw_datasets["validation"] - if data_args.max_eval_samples is not None: - # We will select sample from whole data - eval_examples = eval_examples.select(range(data_args.max_eval_samples)) - # Validation Feature Creation - with training_args.main_process_first(desc="validation dataset map pre-processing"): - eval_dataset = eval_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on validation dataset", - ) - if data_args.max_eval_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - - if training_args.do_predict: - if "test" not in raw_datasets: - raise ValueError("--do_predict requires a test dataset") - predict_examples = raw_datasets["test"] - if data_args.max_predict_samples is not None: - # We will select sample from whole data - predict_examples = predict_examples.select(range(data_args.max_predict_samples)) - # Predict Feature Creation - with training_args.main_process_first(desc="prediction dataset map pre-processing"): - predict_dataset = predict_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on prediction dataset", - ) - if data_args.max_predict_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - - # Data collator - # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data - # collator. - data_collator = ( - default_data_collator - if data_args.pad_to_max_length - else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) - ) - - # Post-processing: - def post_processing_function(examples, features, predictions, stage="eval"): - # Post-processing: we match the start logits and end logits to answers in the original context. - predictions = postprocess_qa_predictions( - examples=examples, - features=features, - predictions=predictions, - version_2_with_negative=data_args.version_2_with_negative, - n_best_size=data_args.n_best_size, - max_answer_length=data_args.max_answer_length, - null_score_diff_threshold=data_args.null_score_diff_threshold, - output_dir=training_args.output_dir, - log_level=log_level, - prefix=stage, - ) - # Format the result to the format the metric expects. 
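Just below, post_processing_function reshapes predictions into the structure the squad metric expects. For reference, a toy example of that structure, using the same datasets.load_metric call the script relies on:

    from datasets import load_metric

    metric = load_metric("squad")
    predictions = [{"id": "q1", "prediction_text": "Berlin"}]
    references = [{"id": "q1", "answers": {"text": ["Berlin"], "answer_start": [17]}}]
    print(metric.compute(predictions=predictions, references=references))
    # {'exact_match': 100.0, 'f1': 100.0}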
- if data_args.version_2_with_negative: - formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() - ] - else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] - - references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] - return EvalPrediction(predictions=formatted_predictions, label_ids=references) - - metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") - - def compute_metrics(p: EvalPrediction): - return metric.compute(predictions=p.predictions, references=p.label_ids) - - # Initialize our Trainer - trainer = QuestionAnsweringTrainer( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - eval_examples=eval_examples if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - post_process_function=post_processing_function, - compute_metrics=compute_metrics, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Prediction - if training_args.do_predict: - logger.info("*** Predict ***") - results = trainer.predict(predict_dataset, predict_examples) - metrics = results.metrics - - max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) - ) - metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) - - trainer.log_metrics("predict", metrics) - trainer.save_metrics("predict", metrics) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/scripts/exp-006/xquad/trainer_qa.py b/scripts/exp-006/xquad/trainer_qa.py deleted file mode 100644 index 3e005e9..0000000 --- a/scripts/exp-006/xquad/trainer_qa.py +++ /dev/null @@ -1,105 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The 
HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A subclass of `Trainer` specific to Question-Answering tasks -""" - -from transformers import Trainer, is_torch_tpu_available -from transformers.trainer_utils import PredictionOutput - - -if is_torch_tpu_available(): - import torch_xla.core.xla_model as xm - import torch_xla.debug.metrics as met - - -class QuestionAnsweringTrainer(Trainer): - def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): - super().__init__(*args, **kwargs) - self.eval_examples = eval_examples - self.post_process_function = post_process_function - - def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): - eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset - eval_dataloader = self.get_eval_dataloader(eval_dataset) - eval_examples = self.eval_examples if eval_examples is None else eval_examples - - # Temporarily disable metric computation, we will do it in the loop here. - compute_metrics = self.compute_metrics - self.compute_metrics = None - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - try: - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if compute_metrics is None else None, - ignore_keys=ignore_keys, - ) - finally: - self.compute_metrics = compute_metrics - - if self.post_process_function is not None and self.compute_metrics is not None: - eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) - metrics = self.compute_metrics(eval_preds) - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - self.log(metrics) - else: - metrics = {} - - if self.args.tpu_metrics_debug or self.args.debug: - # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) - xm.master_print(met.metrics_report()) - - self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) - return metrics - - def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): - predict_dataloader = self.get_test_dataloader(predict_dataset) - - # Temporarily disable metric computation, we will do it in the loop here. 
- compute_metrics = self.compute_metrics - self.compute_metrics = None - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - try: - output = eval_loop( - predict_dataloader, - description="Prediction", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if compute_metrics is None else None, - ignore_keys=ignore_keys, - ) - finally: - self.compute_metrics = compute_metrics - - if self.post_process_function is None or self.compute_metrics is None: - return output - - predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") - metrics = self.compute_metrics(predictions) - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) \ No newline at end of file diff --git a/scripts/exp-006/xquad/utils_qa.py b/scripts/exp-006/xquad/utils_qa.py deleted file mode 100644 index dedbd85..0000000 --- a/scripts/exp-006/xquad/utils_qa.py +++ /dev/null @@ -1,431 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Post-processing utilities for question answering. -""" -import collections -import json -import logging -import os -from typing import Optional, Tuple - -import numpy as np -from tqdm.auto import tqdm - - -logger = logging.getLogger(__name__) - - -def postprocess_qa_predictions( - examples, - features, - predictions: Tuple[np.ndarray, np.ndarray], - version_2_with_negative: bool = False, - n_best_size: int = 20, - max_answer_length: int = 30, - null_score_diff_threshold: float = 0.0, - output_dir: Optional[str] = None, - prefix: Optional[str] = None, - log_level: Optional[int] = logging.WARNING, -): - """ - Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the - original contexts. This is the base postprocessing functions for models that only return start and end logits. - - Args: - examples: The non-preprocessed dataset (see the main script for more information). - features: The processed dataset (see the main script for more information). - predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): - The predictions of the model: two arrays containing the start logits and the end logits respectively. Its - first dimension must match the number of elements of :obj:`features`. - version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the underlying dataset contains examples with no answers. - n_best_size (:obj:`int`, `optional`, defaults to 20): - The total number of n-best predictions to generate when looking for an answer. 
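
# Hedged sketch (not part of the patch) of how `postprocess_qa_predictions`, whose
# deletion starts above, is typically wired into the trainer's `post_process_function`
# hook; it mirrors the run_qa.py code earlier in this diff. Assumes utils_qa.py is
# importable and that the answer column is named "answers".
from datasets import load_metric
from transformers import EvalPrediction
from utils_qa import postprocess_qa_predictions

def post_processing_function(examples, features, predictions):
    # Turn raw start/end logits into text answers keyed by example id.
    answers = postprocess_qa_predictions(examples=examples, features=features, predictions=predictions)
    formatted = [{"id": k, "prediction_text": v} for k, v in answers.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return EvalPrediction(predictions=formatted, label_ids=references)

metric = load_metric("squad")  # "squad_v2" when unanswerable questions are allowed
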
- max_answer_length (:obj:`int`, `optional`, defaults to 30): - The maximum length of an answer that can be generated. This is needed because the start and end predictions - are not conditioned on one another. - null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): - The threshold used to select the null answer: if the best answer has a score that is less than the score of - the null answer minus this threshold, the null answer is selected for this example (note that the score of - the null answer for an example giving several features is the minimum of the scores for the null answer on - each feature: all features must be aligned on the fact they `want` to predict a null answer). - - Only useful when :obj:`version_2_with_negative` is :obj:`True`. - output_dir (:obj:`str`, `optional`): - If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if - :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null - answers, are saved in `output_dir`. - prefix (:obj:`str`, `optional`): - If provided, the dictionaries mentioned above are saved with `prefix` added to their names. - log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): - ``logging`` log level (e.g., ``logging.WARNING``) - """ - if len(predictions) != 2: - raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") - all_start_logits, all_end_logits = predictions - - if len(predictions[0]) != len(features): - raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") - - # Build a map example to its corresponding features. - example_id_to_index = {k: i for i, k in enumerate(examples["id"])} - features_per_example = collections.defaultdict(list) - for i, feature in enumerate(features): - features_per_example[example_id_to_index[feature["example_id"]]].append(i) - - # The dictionaries we have to fill. - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - if version_2_with_negative: - scores_diff_json = collections.OrderedDict() - - # Logging. - logger.setLevel(log_level) - logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") - - # Let's loop over all the examples! - for example_index, example in enumerate(tqdm(examples)): - # Those are the indices of the features associated to the current example. - feature_indices = features_per_example[example_index] - - min_null_prediction = None - prelim_predictions = [] - - # Looping through all the features associated to the current example. - for feature_index in feature_indices: - # We grab the predictions of the model for this feature. - start_logits = all_start_logits[feature_index] - end_logits = all_end_logits[feature_index] - # This is what will allow us to map some the positions in our logits to span of texts in the original - # context. - offset_mapping = features[feature_index]["offset_mapping"] - # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context - # available in the current feature. - token_is_max_context = features[feature_index].get("token_is_max_context", None) - - # Update minimum null prediction. 
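
# Toy illustration (made-up ids, not part of the patch) of the example -> features
# regrouping above: one example can be split into several overlapping features, so
# predictions must be gathered back per example id before picking an answer.
import collections

examples = {"id": ["q0", "q1"]}
features = [{"example_id": "q0"}, {"example_id": "q1"}, {"example_id": "q1"}]

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

assert dict(features_per_example) == {0: [0], 1: [1, 2]}
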
- feature_null_score = start_logits[0] + end_logits[0] - if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: - min_null_prediction = { - "offsets": (0, 0), - "score": feature_null_score, - "start_logit": start_logits[0], - "end_logit": end_logits[0], - } - - # Go through all possibilities for the `n_best_size` greater start and end logits. - start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() - end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() - for start_index in start_indexes: - for end_index in end_indexes: - # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond - # to part of the input_ids that are not in the context. - if ( - start_index >= len(offset_mapping) - or end_index >= len(offset_mapping) - or offset_mapping[start_index] is None - or offset_mapping[end_index] is None - ): - continue - # Don't consider answers with a length that is either < 0 or > max_answer_length. - if end_index < start_index or end_index - start_index + 1 > max_answer_length: - continue - # Don't consider answer that don't have the maximum context available (if such information is - # provided). - if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): - continue - prelim_predictions.append( - { - "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), - "score": start_logits[start_index] + end_logits[end_index], - "start_logit": start_logits[start_index], - "end_logit": end_logits[end_index], - } - ) - if version_2_with_negative: - # Add the minimum null prediction - prelim_predictions.append(min_null_prediction) - null_score = min_null_prediction["score"] - - # Only keep the best `n_best_size` predictions. - predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] - - # Add back the minimum null prediction if it was removed because of its low score. - if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): - predictions.append(min_null_prediction) - - # Use the offsets to gather the answer text in the original context. - context = example["context"] - for pred in predictions: - offsets = pred.pop("offsets") - pred["text"] = context[offsets[0] : offsets[1]] - - # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid - # failure. - if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): - predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) - - # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using - # the LogSumExp trick). - scores = np.array([pred.pop("score") for pred in predictions]) - exp_scores = np.exp(scores - np.max(scores)) - probs = exp_scores / exp_scores.sum() - - # Include the probabilities in our predictions. - for prob, pred in zip(probs, predictions): - pred["probability"] = prob - - # Pick the best prediction. If the null answer is not possible, this is easy. - if not version_2_with_negative: - all_predictions[example["id"]] = predictions[0]["text"] - else: - # Otherwise we first need to find the best non-empty prediction. - i = 0 - while predictions[i]["text"] == "": - i += 1 - best_non_null_pred = predictions[i] - - # Then we compare to the null prediction using the threshold. 
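
# Illustrative sketch (made-up numbers, not part of the patch) of the two numpy idioms
# used in the loop above: picking the n_best largest logits via argsort slicing, and a
# numerically stable softmax over the candidate scores.
import numpy as np

logits = np.array([1.5, -0.2, 3.1, 0.7])
n_best_size = 2
top_indexes = np.argsort(logits)[-1 : -n_best_size - 1 : -1].tolist()
assert top_indexes == [2, 0]  # indices of the two largest logits, best first

scores = np.array([5.0, 3.0, 1.0])
exp_scores = np.exp(scores - np.max(scores))  # subtract the max so exp() cannot overflow
probs = exp_scores / exp_scores.sum()
assert abs(probs.sum() - 1.0) < 1e-9
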
- score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] - scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. - if score_diff > null_score_diff_threshold: - all_predictions[example["id"]] = "" - else: - all_predictions[example["id"]] = best_non_null_pred["text"] - - # Make `predictions` JSON-serializable by casting np.float back to float. - all_nbest_json[example["id"]] = [ - {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} - for pred in predictions - ] - - # If we have an output_dir, let's save all those dicts. - if output_dir is not None: - if not os.path.isdir(output_dir): - raise EnvironmentError(f"{output_dir} is not a directory.") - - prediction_file = os.path.join( - output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" - ) - nbest_file = os.path.join( - output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" - ) - if version_2_with_negative: - null_odds_file = os.path.join( - output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" - ) - - logger.info(f"Saving predictions to {prediction_file}.") - with open(prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - logger.info(f"Saving nbest_preds to {nbest_file}.") - with open(nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - if version_2_with_negative: - logger.info(f"Saving null_odds to {null_odds_file}.") - with open(null_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions - - -def postprocess_qa_predictions_with_beam_search( - examples, - features, - predictions: Tuple[np.ndarray, np.ndarray], - version_2_with_negative: bool = False, - n_best_size: int = 20, - max_answer_length: int = 30, - start_n_top: int = 5, - end_n_top: int = 5, - output_dir: Optional[str] = None, - prefix: Optional[str] = None, - log_level: Optional[int] = logging.WARNING, -): - """ - Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the - original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as - cls token predictions. - - Args: - examples: The non-preprocessed dataset (see the main script for more information). - features: The processed dataset (see the main script for more information). - predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): - The predictions of the model: two arrays containing the start logits and the end logits respectively. Its - first dimension must match the number of elements of :obj:`features`. - version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the underlying dataset contains examples with no answers. - n_best_size (:obj:`int`, `optional`, defaults to 20): - The total number of n-best predictions to generate when looking for an answer. - max_answer_length (:obj:`int`, `optional`, defaults to 30): - The maximum length of an answer that can be generated. This is needed because the start and end predictions - are not conditioned on one another. - start_n_top (:obj:`int`, `optional`, defaults to 5): - The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. 
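
# Compact restatement (illustrative values, not part of the patch) of the SQuAD v2
# null-answer rule applied above: predict "" only when the null score beats the best
# non-null span by more than null_score_diff_threshold.
def pick_answer(null_score, best_pred, null_score_diff_threshold=0.0):
    score_diff = null_score - best_pred["start_logit"] - best_pred["end_logit"]
    return "" if score_diff > null_score_diff_threshold else best_pred["text"]

best = {"text": "Paris", "start_logit": 4.2, "end_logit": 3.9}
assert pick_answer(null_score=2.0, best_pred=best) == "Paris"  # span wins
assert pick_answer(null_score=10.0, best_pred=best) == ""      # null answer wins
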
- end_n_top (:obj:`int`, `optional`, defaults to 5): - The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. - output_dir (:obj:`str`, `optional`): - If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if - :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null - answers, are saved in `output_dir`. - prefix (:obj:`str`, `optional`): - If provided, the dictionaries mentioned above are saved with `prefix` added to their names. - log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): - ``logging`` log level (e.g., ``logging.WARNING``) - """ - if len(predictions) != 5: - raise ValueError("`predictions` should be a tuple with five elements.") - start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions - - if len(predictions[0]) != len(features): - raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") - - # Build a map example to its corresponding features. - example_id_to_index = {k: i for i, k in enumerate(examples["id"])} - features_per_example = collections.defaultdict(list) - for i, feature in enumerate(features): - features_per_example[example_id_to_index[feature["example_id"]]].append(i) - - # The dictionaries we have to fill. - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() if version_2_with_negative else None - - # Logging. - logger.setLevel(log_level) - logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") - - # Let's loop over all the examples! - for example_index, example in enumerate(tqdm(examples)): - # Those are the indices of the features associated to the current example. - feature_indices = features_per_example[example_index] - - min_null_score = None - prelim_predictions = [] - - # Looping through all the features associated to the current example. - for feature_index in feature_indices: - # We grab the predictions of the model for this feature. - start_log_prob = start_top_log_probs[feature_index] - start_indexes = start_top_index[feature_index] - end_log_prob = end_top_log_probs[feature_index] - end_indexes = end_top_index[feature_index] - feature_null_score = cls_logits[feature_index] - # This is what will allow us to map some the positions in our logits to span of texts in the original - # context. - offset_mapping = features[feature_index]["offset_mapping"] - # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context - # available in the current feature. - token_is_max_context = features[feature_index].get("token_is_max_context", None) - - # Update minimum null prediction - if min_null_score is None or feature_null_score < min_null_score: - min_null_score = feature_null_score - - # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. 
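
# Illustrative note (made-up shapes, not part of the patch) on the five arrays unpacked
# above for beam-search QA heads: end candidates are searched per top start, so the end
# arrays carry start_n_top * end_n_top entries per feature and are indexed below as
# j_index = i * end_n_top + j.
import numpy as np

start_n_top, end_n_top, num_features = 5, 5, 3
start_top_log_probs = np.zeros((num_features, start_n_top))
end_top_log_probs = np.zeros((num_features, start_n_top * end_n_top))
cls_logits = np.zeros(num_features)  # per-feature "no answer" score; the minimum over features is kept

i, j = 2, 4                      # 3rd best start, 5th best end for that start
j_index = i * end_n_top + j      # position of that (start, end) pair in the flattened end arrays
assert j_index == 14
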
- for i in range(start_n_top): - for j in range(end_n_top): - start_index = int(start_indexes[i]) - j_index = i * end_n_top + j - end_index = int(end_indexes[j_index]) - # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the - # p_mask but let's not take any risk) - if ( - start_index >= len(offset_mapping) - or end_index >= len(offset_mapping) - or offset_mapping[start_index] is None - or offset_mapping[end_index] is None - ): - continue - # Don't consider answers with a length negative or > max_answer_length. - if end_index < start_index or end_index - start_index + 1 > max_answer_length: - continue - # Don't consider answer that don't have the maximum context available (if such information is - # provided). - if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): - continue - prelim_predictions.append( - { - "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), - "score": start_log_prob[i] + end_log_prob[j_index], - "start_log_prob": start_log_prob[i], - "end_log_prob": end_log_prob[j_index], - } - ) - - # Only keep the best `n_best_size` predictions. - predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] - - # Use the offsets to gather the answer text in the original context. - context = example["context"] - for pred in predictions: - offsets = pred.pop("offsets") - pred["text"] = context[offsets[0] : offsets[1]] - - # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid - # failure. - if len(predictions) == 0: - predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) - - # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using - # the LogSumExp trick). - scores = np.array([pred.pop("score") for pred in predictions]) - exp_scores = np.exp(scores - np.max(scores)) - probs = exp_scores / exp_scores.sum() - - # Include the probabilities in our predictions. - for prob, pred in zip(probs, predictions): - pred["probability"] = prob - - # Pick the best prediction and set the probability for the null answer. - all_predictions[example["id"]] = predictions[0]["text"] - if version_2_with_negative: - scores_diff_json[example["id"]] = float(min_null_score) - - # Make `predictions` JSON-serializable by casting np.float back to float. - all_nbest_json[example["id"]] = [ - {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} - for pred in predictions - ] - - # If we have an output_dir, let's save all those dicts. 
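
# Small sketch (not part of the patch) of why the np.float -> float cast above is
# needed before writing the n-best files: json.dumps cannot serialize numpy scalars.
import json
import numpy as np

pred = {"text": "Paris", "probability": np.float32(0.91)}
# json.dumps(pred) would raise TypeError: Object of type float32 is not JSON serializable
clean = {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
json.dumps(clean)  # now serializes cleanly
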
- if output_dir is not None: - if not os.path.isdir(output_dir): - raise EnvironmentError(f"{output_dir} is not a directory.") - - prediction_file = os.path.join( - output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" - ) - nbest_file = os.path.join( - output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" - ) - if version_2_with_negative: - null_odds_file = os.path.join( - output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" - ) - - logger.info(f"Saving predictions to {prediction_file}.") - with open(prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - logger.info(f"Saving nbest_preds to {nbest_file}.") - with open(nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - if version_2_with_negative: - logger.info(f"Saving null_odds to {null_odds_file}.") - with open(null_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions, scores_diff_json \ No newline at end of file diff --git a/scripts/exp-007/madx_run_clm.py b/scripts/exp-007/madx_run_clm.py deleted file mode 100644 index 581b07c..0000000 --- a/scripts/exp-007/madx_run_clm.py +++ /dev/null @@ -1,593 +0,0 @@ -""" -Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py -""" - -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import torch -import pathlib - -import datasets -from datasets import load_dataset - -import transformers -import transformers.adapters.composition as ac -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AdapterTrainer, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - MultiLingAdapterArguments, - Trainer, - TrainingArguments, - default_data_collator, - set_seed, -) -from transformers.adapters.configuration import AdapterConfig -from transformers.testing_utils import CaptureLogger -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - -logger = logging.getLogger(__name__) - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": "Override some existing default config settings when a model is trained from scratch. 
Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - - block_size: Optional[int] = field( - default=None, - metadata={ - "help": "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." 
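
# Hedged sketch (the toy dataclass is made up, not part of the patch) of the
# dataclass-plus-HfArgumentParser pattern these argument classes rely on: field
# metadata becomes the --help text and field defaults become CLI defaults.
from dataclasses import dataclass, field
from typing import Optional
from transformers import HfArgumentParser

@dataclass
class ToyArguments:
    model_name_or_path: Optional[str] = field(default=None, metadata={"help": "Checkpoint to start from."})
    block_size: Optional[int] = field(default=None, metadata={"help": "Sequence length after tokenization."})

parser = HfArgumentParser(ToyArguments)
(toy_args,) = parser.parse_args_into_dataclasses(args=["--model_name_or_path", "gpt2", "--block_size", "1024"])
assert toy_args.model_name_or_path == "gpt2" and toy_args.block_size == 1024
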
- }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) - - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args, adapter_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) - else: - model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"model_args {model_args}") - logger.info(f"data_args {data_args}") - logger.info(f"Training/evaluation parameters {training_args}") - logger.info(f"Adapter parameters {adapter_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. 
To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
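
# Illustrative sketch (not part of the patch) of the split-slicing trick used above:
# when a hub dataset ships without a validation split, the first
# validation_split_percentage% of "train" is carved off for evaluation. The OSCAR
# config below is only an example taken from the shell scripts in this diff.
from datasets import load_dataset

validation_split_percentage = 5
raw_validation = load_dataset("oscar", "unshuffled_deduplicated_ko",
                              split=f"train[:{validation_split_percentage}%]")
raw_train = load_dataset("oscar", "unshuffled_deduplicated_ko",
                         split=f"train[{validation_split_percentage}%:]")
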
- - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - if model_args.model_name_or_path: - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - model = AutoModelForCausalLM.from_config(config) - n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - - model.resize_token_embeddings(len(tokenizer)) - - # Setup adapters - if adapter_args.train_adapter: - task_name = data_args.dataset_name or "clm" - task_name += f"_{adapter_args.language}" - # check if adapter already exists, otherwise add it - if task_name not in model.config.adapters: - # resolve the adapter config - adapter_config = AdapterConfig.load( - adapter_args.adapter_config, - non_linearity=adapter_args.adapter_non_linearity, - reduction_factor=adapter_args.adapter_reduction_factor, - ) - # load a pre-trained from Hub if specified - if adapter_args.load_adapter: - model.load_adapter( - adapter_args.load_adapter, - config=adapter_config, - load_as=task_name, - ) - # otherwise, add a fresh adapter - else: - model.add_adapter(task_name, config=adapter_config) - # optionally load a pre-trained language adapter - if adapter_args.load_lang_adapter: - # resolve the language adapter config - lang_adapter_config = AdapterConfig.load( - adapter_args.lang_adapter_config, - non_linearity=adapter_args.lang_adapter_non_linearity, - reduction_factor=adapter_args.lang_adapter_reduction_factor, - ) - # load the language adapter from Hub - lang_adapter_name = model.load_adapter( - adapter_args.load_lang_adapter, - config=lang_adapter_config, - load_as=adapter_args.language, - ) - else: - lang_adapter_name = None - # Freeze all model weights except of those of this adapter - model.train_adapter([task_name]) - # Set the adapters to be used in every forward pass - if lang_adapter_name: - model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) - else: - 
model.set_active_adapters(task_name) - else: - if adapter_args.load_adapter or adapter_args.load_lang_adapter: - raise ValueError( - "Adapters can only be loaded in adapters training mode." - "Use --train_adapter to enable adapter training" - ) - - for name, param in model.named_parameters(): - if not param.requires_grad: - print(f"🥶 Frozen layer '{name}'") - else: - print(f"🚀 Trainable layer '{name}'") - - # Preprocessing the datasets. - # First we tokenize all the texts. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - else: - column_names = raw_datasets["validation"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - - def tokenize_function(examples): - with CaptureLogger(tok_logger) as cl: - output = tokenizer(examples[text_column_name]) - # clm input could be much much longer than block_size - if "Token indices sequence length is longer than the" in cl.out: - tok_logger.warning( - "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." - ) - return output - - with training_args.main_process_first(desc="dataset map tokenization"): - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.output_dir}/tokenized_datasets.pt") - saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): - tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) - logger.info("Sanity check: loaded tokenized_datasets") - else: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - torch.save(tokenized_datasets, saved_tokenized_datasets_fp) - logger.info("Sanity check: saved tokenized_datasets") - - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." - ) - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." - ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. 
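
# Condensed, hedged sketch of the adapter setup a few lines above (adapter-transformers
# API as used throughout this patch; adapter names are examples): add a task adapter,
# freeze everything else, and optionally stack it on a pre-trained language adapter.
import transformers.adapters.composition as ac
from transformers import AutoModelForCausalLM
from transformers.adapters.configuration import AdapterConfig

model = AutoModelForCausalLM.from_pretrained("gpt2")
adapter_config = AdapterConfig.load("pfeiffer+inv")   # invertible adapters, as in the run_clm_*.sh scripts
model.add_adapter("oscar_ko", config=adapter_config)
model.train_adapter(["oscar_ko"])                     # freezes all non-adapter weights

lang_adapter_name = None                              # e.g. an adapter loaded via model.load_adapter(...)
if lang_adapter_name:
    model.set_active_adapters(ac.Stack(lang_adapter_name, "oscar_ko"))
else:
    model.set_active_adapters("oscar_ko")

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"🚀 Trainable params: {trainable} / {total}")
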
- result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - with training_args.main_process_first(desc="grouping texts together"): - saved_lm_datasets_fp = pathlib.Path(f"{training_args.output_dir}/lm_datasets.pt") - if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): - lm_datasets = torch.load(str(saved_lm_datasets_fp)) - logger.info("Sanity check: loaded lm_datasets") - else: - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - torch.save(lm_datasets, saved_lm_datasets_fp) - logger.info("Sanity check: saved lm_datasets") - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = lm_datasets["train"] - if data_args.max_train_samples is not None: - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = lm_datasets["validation"] - if data_args.max_eval_samples is not None: - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - - # Initialize our Trainer - trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer - trainer = trainer_class( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. 
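
# Toy demonstration (made-up token ids, not part of the patch) of what group_texts
# above does: concatenate all tokenized sequences, drop the remainder, cut fixed-size
# blocks, and copy input_ids into labels for causal-LM training.
block_size = 4
examples = {"input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]}

concatenated = {k: sum(examples[k], []) for k in examples}
total_length = (len(concatenated["input_ids"]) // block_size) * block_size
result = {
    k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
    for k, t in concatenated.items()
}
result["labels"] = result["input_ids"].copy()
assert result["input_ids"] == [[1, 2, 3, 4], [5, 6, 7, 8]]  # tokens 9 and 10 are dropped
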
- data_collator=default_data_collator, - ) - - logger.info(model) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/scripts/exp-007/run_clm_de.sh b/scripts/exp-007/run_clm_de.sh deleted file mode 100644 index f1aa596..0000000 --- a/scripts/exp-007/run_clm_de.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=5-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:4 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=16 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-007-run_clm_de_madx - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_de_madx.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_de_madx.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -model_name="gpt2" -tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/exp-005/oscar-de-tokenizer" -cache_dir="${FP_BIGS}/data/external/oscar_de" -output_dir="${FP_BIGS}/data/processed/exp-007/madx-gpt2-de" -logging_dir="${FP_BIGS}/reports/exp-007/madx-gpt2-de" - -python $FP_BIGS/scripts/exp-007/madx_run_clm.py \ - --model_name_or_path $model_name \ - --tokenizer_name $tokenizer_dir \ - --dataset_name oscar \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_de \ - --logging_dir $logging_dir \ - --report_to "tensorboard" \ - --learning_rate 0.001 \ - --do_train \ - --do_eval \ - --output_dir $output_dir \ - --preprocessing_num_workers 8 \ - --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ - --eval_steps 1000 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ - --train_adapter \ - --adapter_config "pfeiffer+inv" \ - --language "de" \ - --num_train_epochs 6.0 \ No newline at end of file diff --git a/scripts/exp-007/run_clm_en.sh b/scripts/exp-007/run_clm_en.sh deleted file mode 100644 index 776222f..0000000 --- a/scripts/exp-007/run_clm_en.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=5-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:4 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=16 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-007-run_clm_en_madx - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_en_madx.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_en_madx.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -model_name="gpt2" -tokenizer_dir="gpt2" -cache_dir="${FP_BIGS}/data/external/oscar_en" -output_dir="${FP_BIGS}/data/processed/exp-007/madx-gpt2-en" -logging_dir="${FP_BIGS}/reports/exp-007/madx-gpt2-en" - -python $FP_BIGS/scripts/exp-007/madx_run_clm.py \ - --model_name_or_path $model_name \ - --tokenizer_name $tokenizer_dir \ - --dataset_name oscar \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_en \ - --logging_dir $logging_dir \ - --report_to "tensorboard" \ - --learning_rate 0.001 \ - --do_train \ - --do_eval \ - --output_dir $output_dir \ - --preprocessing_num_workers 8 \ - --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ - --eval_steps 1000 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ - --train_adapter \ - --train_adapter \ - --adapter_config "pfeiffer+inv" \ - --language "en" \ - --num_train_epochs 6.0 \ No newline at end of file diff --git a/scripts/exp-007/run_clm_ko.sh b/scripts/exp-007/run_clm_ko.sh deleted file mode 100644 index 611b4f2..0000000 --- a/scripts/exp-007/run_clm_ko.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=5-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=3090-gcondo --gres=gpu:4 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=16 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-007-run_clm_ko_madx - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_ko_madx.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/007/run_clm_ko_madx.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_adapter/bin/activate - -model_name="gpt2" -tokenizer_dir="yongzx/gpt2-finetuned-oscar-ko" -cache_dir="${FP_BIGS}/data/external/oscar_ko" -output_dir="${FP_BIGS}/data/processed/exp-007/madx-gpt2-ko" -logging_dir="${FP_BIGS}/reports/exp-007/madx-gpt2-ko" -# ckpt_dir="${FP_BIGS}/data/processed/exp-007/ft-gpt2-2/checkpoint-25000" - -python $FP_BIGS/scripts/exp-007/madx_run_clm.py \ - --model_name_or_path $model_name \ - --tokenizer_name $tokenizer_dir \ - --dataset_name oscar \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_ko \ - --logging_dir $logging_dir \ - --report_to "tensorboard" \ - --learning_rate 0.001 \ - --do_train \ - --do_eval \ - --output_dir $output_dir \ - --preprocessing_num_workers 8 \ - --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ - --eval_steps 1000 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ - --train_adapter \ - --adapter_config "pfeiffer+inv" \ - --language "ko" \ - --num_train_epochs 6.0 \ No newline at end of file diff --git a/scripts/exp-008/xnli/xnli_de.py b/scripts/exp-008/xnli/xnli_de.py deleted file mode 100644 index e21cee8..0000000 --- a/scripts/exp-008/xnli/xnli_de.py +++ /dev/null @@ -1,151 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import BertTokenizer, BertForSequenceClassification - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--lang", type=str, default="de") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -print("Arguments: ========") -print(args) - -# load dataset -if args.zero_shot: - print("0️⃣ 0-Shot") - # 0-shot: use english as train and validation - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - xnli_dataset = load_dataset("xnli", args.lang, 
cache_dir=args.cache_dir) - assert args.lang != "en" - - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = xnli_dataset['test'] -else: - print("👀 Supervised Training") - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - - train_dataset = xnli_dataset['train'] - val_dataset = xnli_dataset['validation'] - test_dataset = xnli_dataset['test'] - - -# load tokenizer -tokenizer = BertTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, pad_to_max_length=True) - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.sep_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -logger.info("Tokenizing the dataset...") -# tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = train_dataset.map(tokenize_function, batched=False) -full_val_dataset = val_dataset.map(tokenize_function, batched=False) -full_test_dataset = test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - -from datasets import load_metric -metric = load_metric("xnli") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - eval_steps=500 if not args.use_partial_data else 10, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="steps", - save_strategy="steps", - logging_strategy="steps", - logging_steps=500, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -def load_model(args): - return BertForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=0, - cache_dir=args.cache_dir) - - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - - model = load_model(args) - training_args.report_to = list() - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-008/xnli/xnli_de_mbert.sh 
b/scripts/exp-008/xnli/xnli_de_mbert.sh deleted file mode 100644 index e6d2ca4..0000000 --- a/scripts/exp-008/xnli/xnli_de_mbert.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-008-xnli_de_mbert - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/008/xnli_de_mbert.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/008/xnli_de_mbert.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="bert-base-multilingual-uncased" - TOKENIZER_NAME="bert-base-multilingual-uncased" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-008/xnli/$LANG/xnli_${LANG}_mbert_${lr}" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-008/xnli/xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train -done diff --git a/scripts/exp-008/xnli/xnli_de_mbert_0shot.sh b/scripts/exp-008/xnli/xnli_de_mbert_0shot.sh deleted file mode 100644 index 0783b53..0000000 --- a/scripts/exp-008/xnli/xnli_de_mbert_0shot.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-008-xnli_de_mbert_0shot - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/008/xnli_de_mbert_0shot.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/008/xnli_de_mbert_0shot.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="bert-base-multilingual-uncased" - TOKENIZER_NAME="bert-base-multilingual-uncased" - LANG="de" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-008/xnli/$LANG/xnli_${LANG}_mbert_0shot_${lr}" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-008/xnli/xnli_de.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --zero_shot -done diff --git a/scripts/exp-008/xnli/xnli_ko.py b/scripts/exp-008/xnli/xnli_ko.py deleted file mode 100644 index 5ebee8c..0000000 --- a/scripts/exp-008/xnli/xnli_ko.py +++ /dev/null @@ -1,197 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer -from transformers import BertTokenizer, BertForSequenceClassification - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - - -KLUE = namedtuple("KLUE", ["klue_split", "num_labels", "metric", "model_type"]) -KLUE_TASKS = { - "topic-cls": KLUE(klue_split="ynat", num_labels=7, metric="f1/macro", model_type="seq-cls"), - "sts-pearsonr": KLUE(klue_split="sts", num_labels=1, metric="pearsonr", model_type="seq-cls"), - "sts-binary": KLUE(klue_split="sts", num_labels=1, metric="f1/macro", model_type="seq-cls"), - "nli": KLUE(klue_split="nli", num_labels=3, metric="accuracy", model_type="seq-cls"), -} - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--klue_task", choices=KLUE_TASKS.keys(), default="nli") -parser.add_argument("--lang", type=str, default="ko") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -print("Arguments: ========") -print(args) - -# load dataset 
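The deleted xnli_ko.py above drives several KLUE tasks from one namedtuple registry, so the dataset split, label count, metric and head type are looked up instead of hard-coded. A trimmed sketch of that pattern, reduced to two entries for illustration:

from collections import namedtuple

KLUE = namedtuple("KLUE", ["klue_split", "num_labels", "metric", "model_type"])

KLUE_TASKS = {
    "topic-cls": KLUE(klue_split="ynat", num_labels=7, metric="f1/macro", model_type="seq-cls"),
    "nli": KLUE(klue_split="nli", num_labels=3, metric="accuracy", model_type="seq-cls"),
}

# Downstream code branches on these fields rather than on the task name itself.
task = KLUE_TASKS["nli"]
print(task.klue_split, task.num_labels, task.metric)  # nli 3 accuracy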
-klue_dataset = load_dataset("klue", KLUE_TASKS[args.klue_task].klue_split, cache_dir=args.cache_dir) -if args.zero_shot: - print("0️⃣ 0-Shot") - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - - if "test" not in klue_dataset: - _train_dataset = klue_dataset['train'].train_test_split(train_size=0.8, shuffle=True, seed=42) - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = klue_dataset['validation'] - else: - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = klue_dataset['test'] -else: - print("👀 Supervised Training") - if "test" not in klue_dataset: - _train_dataset = klue_dataset['train'].train_test_split(train_size=0.8, shuffle=True, seed=42) - train_dataset = _train_dataset['train'] - val_dataset = _train_dataset['test'] - test_dataset = klue_dataset['validation'] - else: - train_dataset = klue_dataset['train'] - val_dataset = klue_dataset['validation'] - test_dataset = klue_dataset['test'] - - -# load tokenizer -tokenizer = BertTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) - -def tokenize_function(examples): - if KLUE_TASKS[args.klue_task].klue_split == "ynat": - return tokenizer(examples["title"], max_length=128, padding="max_length", truncation=True) - elif KLUE_TASKS[args.klue_task].klue_split == "sts": - return tokenizer(f'{examples["sentence1"]} {tokenizer.sep_token} {examples["sentence2"]}', max_length=128, padding="max_length", truncation=True) - elif KLUE_TASKS[args.klue_task].klue_split == "nli": - return tokenizer(f'{examples["premise"]} {tokenizer.sep_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -def postprocessing(example): - if KLUE_TASKS[args.klue_task].klue_split == "sts": - example['labels'] = example['labels']['real-label'] - return example - else: - return example - -logger.info("Tokenizing the dataset...") -# tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -full_train_dataset = train_dataset.map(tokenize_function, batched=False).map(postprocessing) -full_val_dataset = val_dataset.map(tokenize_function, batched=False).map(postprocessing) -full_test_dataset = test_dataset.map(tokenize_function, batched=False).map(postprocessing) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - -def compute_metrics(eval_pred): - logits, labels = eval_pred - - if "pearsonr" in KLUE_TASKS[args.klue_task].metric: - predictions = logits.flatten() - else: - predictions = np.argmax(logits, axis=-1) - - ### only for STS-binary - if args.klue_task == "sts-binary": - predictions = np.where(logits.flatten() > 3.0, 1, 0) - labels = np.where(labels > 3.0, 1, 0) - # print(predictions) - # print(labels) - # assert False - - # apply metric - metric = load_metric(KLUE_TASKS[args.klue_task].metric.split("/")[0]) - if "/" in KLUE_TASKS[args.klue_task].metric: - return metric.compute(predictions=predictions, - references=labels, - average=KLUE_TASKS[args.klue_task].metric.split("/")[1]) - else: - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - 
num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=1, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - - -def load_model(args): - if KLUE_TASKS[args.klue_task].model_type == "seq-cls": - return BertForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=KLUE_TASKS[args.klue_task].num_labels, - pad_token_id=0, - cache_dir=args.cache_dir) - - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_model = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - model = load_model(args) - training_args.report_to = list() - - trainer = Trainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/exp-008/xnli/xnli_ko_mbert.sh b/scripts/exp-008/xnli/xnli_ko_mbert.sh deleted file mode 100644 index 37def1a..0000000 --- a/scripts/exp-008/xnli/xnli_ko_mbert.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. 
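Two conventions in the compute_metrics function above are easy to miss: a metric spelled f1/macro is split on the slash and the suffix forwarded as the average= argument, and the STS-binary task binarises both predictions and gold scores at a 3.0 threshold. A hedged sketch of that dispatch, using datasets.load_metric as the original does (the function name is ours):

import numpy as np
from datasets import load_metric

def compute_task_metric(metric_spec, logits, labels, sts_binary=False):
    # "f1/macro" -> metric "f1" with average="macro"; plain names pass through unchanged.
    name, _, average = metric_spec.partition("/")
    if name == "pearsonr":
        predictions = logits.flatten()
    else:
        predictions = np.argmax(logits, axis=-1)
    if sts_binary:
        # STS-binary: binarise model scores and gold scores at the 3.0 threshold.
        predictions = np.where(logits.flatten() > 3.0, 1, 0)
        labels = np.where(labels > 3.0, 1, 0)
    metric = load_metric(name)
    kwargs = {"average": average} if average else {}
    return metric.compute(predictions=predictions, references=labels, **kwargs)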
-#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-008-xnli_ko_mbert - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/008/xnli_ko_mbert.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/008/xnli_ko_mbert.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="bert-base-multilingual-uncased" - TOKENIZER_NAME="bert-base-multilingual-uncased" - LANG="ko" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-008/xnli/$LANG/xnli_${LANG}_mbert_${lr}" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-008/xnli/xnli_ko.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train -done diff --git a/scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh b/scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh deleted file mode 100644 index af3686d..0000000 --- a/scripts/exp-008/xnli/xnli_ko_mbert_0shot.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Request half an hour of runtime: -#SBATCH --time=1-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 - -# Default resources are 1 core with 2.8GB of memory. -#SBATCH --ntasks=2 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-008-xnli_ko_mbert_0shot - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/008/xnli_ko_mbert_0shot.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/008/xnli_ko_mbert_0shot.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -source $FP_BIGS/env_lang_mod/bin/activate - -# learning_rates=( 1e-5 5e-5 1e-6 5e-6 ) -learning_rates=( 1e-5 ) -for lr in ${learning_rates[@]} ; do - echo "LR ===== $lr" - MODEL_NAME="bert-base-multilingual-uncased" - TOKENIZER_NAME="bert-base-multilingual-uncased" - LANG="ko" - OUTPUT_DIR="$FP_BIGS/data/processed/exp-008/xnli/$LANG/xnli_${LANG}_mbert_0shot_${lr}" - CACHE_DIR="$FP_BIGS/data/external/xnli" - mkdir -p $OUTPUT_DIR - - python $FP_BIGS/scripts/exp-008/xnli/xnli_ko.py \ - $OUTPUT_DIR \ - --lang $LANG \ - --cache_dir $CACHE_DIR \ - --num_train_epochs 10 \ - --learning_rate $lr \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --pretrained_model $MODEL_NAME \ - --tokenizer $TOKENIZER_NAME \ - --do_train \ - --do_eval_after_train \ - --zero_shot -done From 7659d05c046c9f24ae19bfd84fd45011b2538c0f Mon Sep 17 00:00:00 2001 From: yongzx Date: Fri, 6 May 2022 08:03:32 -0400 Subject: [PATCH 064/142] update madx_run_clm --- scripts/lang_adapt/madx_run_clm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index de46184..2a5e26a 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -437,7 +437,6 @@ def modify_model(adapter_args, data_args, model_args, tokenizer, model): config=adapter_config, 
load_as=task_name, ) - # otherwise, add a fresh adapter else: model.add_adapter(task_name, config=adapter_config) # optionally load a pre-trained language adapter @@ -484,6 +483,10 @@ def modify_model(adapter_args, data_args, model_args, tokenizer, model): frozen_params = 0 emb_params = 0 for name, param in model.named_parameters(): + if "wte" in name or "wpe" in name: + param.requires_grad = True + emb_params += param.numel() + if not param.requires_grad: print(f"🥶 Frozen layer '{name}'") frozen_params += param.numel() @@ -583,7 +586,8 @@ def main(): data_collator=default_data_collator ) - logger.info(model) + print("Model: 👇") + print(model) # Training if training_args.do_train: From a47e65b290109bfc8bbb946b9567d46dd2d3f8c6 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 6 May 2022 15:35:58 +0200 Subject: [PATCH 065/142] adapted xnli script to properly load wte, wpe and adapters --- scripts/eval_xnli/adapters_xnli_de_vn.py | 228 +++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 scripts/eval_xnli/adapters_xnli_de_vn.py diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py new file mode 100644 index 0000000..45ae562 --- /dev/null +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -0,0 +1,228 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter") +#parser.add_argument("--adapter_lang_name", required=True) -- why is this required?? 
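adapters_xnli_de_vn.py encodes each XNLI example as premise <eos> hypothesis, reusing the GPT-2 eos token as both separator and pad token, and the cross-lingual path keeps a second tokenizer for the English training data (see the tokenize_function / en_tokenize_function pair just below). A minimal sketch of that encoding; gpt2 is only a stand-in for the checkpoints the script receives through --tokenizer and --original_model:

from transformers import AutoTokenizer

# Stand-in checkpoint for illustration only.
tgt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tgt_tokenizer.pad_token = tgt_tokenizer.eos_token

def tokenize_pair(example, tokenizer, max_length=128):
    # Premise and hypothesis joined with the eos token as separator, padded to a fixed length.
    text = f'{example["premise"]} {tokenizer.eos_token} {example["hypothesis"]}'
    return tokenizer(text, max_length=max_length, padding="max_length", truncation=True)

enc = tokenize_pair({"premise": "A man is eating.", "hypothesis": "Someone eats."}, tgt_tokenizer)
print(len(enc["input_ids"]))  # 128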
+parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigsciece model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.do_train: + if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) + else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + + + small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) + small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) + logger.info(full_train_dataset[0]) + logger.info(full_train_dataset[100]) + +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args, inference=False): + # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the 
original one + # even when we call load_adapter + if not args.original_model == args.pretrained_model: + wte = torch.load(f'{args.pretrained_model}/embedding.pt') + wpe = torch.load(f'{args.pretrained_model}/positional_embedding.pt') + + model = GPT2ForSequenceClassification.from_pretrained(args.original_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + if not args.zero_shot: + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + if not args.original_model == args.pretrained_model: + causal_lm_model.transformer.wte = wte + causal_lm_model.transformer.wpe = wpe + if args.madx_lang_adapter: + adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + model.transformer = causal_lm_model.transformer + model.set_active_adapters(adapter_name) + + if not inference: + #if not args.zero_shot: normally need to add adapter in any case + # normally this is already done, why use adapter_lang_name here? + #if args.madx_lang_adapter: + # adapter_name = model.load_adapter(args.madx_lang_adapter, + # config="pfeiffer+inv", + # load_as=args.adapter_lang_name) + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + + + print("🔥 ==================== Training: ==================== 🔥") + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + print(model) + else: + #if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + # normally this is done in any case + #adapter_name = model.load_adapter(args.madx_lang_adapter) + #model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + #else: + # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + # # not sure what happens here + # # for TGT -> TGT supervised finetuning setting, change adapter_name + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + # model.set_active_adapters(adapter_name) + print(model) + + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) From 
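Both the madx_run_clm.py change earlier in this patch series and the training branch above walk model.named_parameters() to keep the GPT-2 embeddings trainable and to log which layers stay frozen. A compact helper capturing that bookkeeping (the function name and return value are ours; the wte/wpe substrings are GPT-2 specific):

def unfreeze_embeddings_and_report(model):
    # Keep word (wte) and position (wpe) embeddings trainable, then tally parameters.
    trainable, frozen, emb = 0, 0, 0
    for name, param in model.named_parameters():
        if "wte" in name or "wpe" in name:
            param.requires_grad = True
            emb += param.numel()
        if param.requires_grad:
            trainable += param.numel()
        else:
            frozen += param.numel()
    print(f"trainable={trainable:,} frozen={frozen:,} embeddings={emb:,}")
    return trainable, frozen, emb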
e9ca92f78b99ea8147571f15e4abe662395e2b90 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Fri, 6 May 2022 15:37:39 +0200 Subject: [PATCH 066/142] updated the way we save the model; added fp16 training --- scripts/lang_adapt/madx_run_clm.py | 4 +++- scripts/lang_adapt/run_clm_adpt_vn.sh | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 2a5e26a..5e030ac 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -600,7 +600,9 @@ def main(): train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload # normally this part only saves the adapters? (TODO: check) - trainer.model.save_embeddings(f'{trainer.args.output_dir}/embedding_layer') + # save embedding and positional embedding (which is not saved by trainer) + trainer.model.save_embeddings(trainer.args.output_dir, 'lng_emb') + torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/positional_embedding.pt') metrics = train_result.metrics diff --git a/scripts/lang_adapt/run_clm_adpt_vn.sh b/scripts/lang_adapt/run_clm_adpt_vn.sh index 44d12af..c585c22 100644 --- a/scripts/lang_adapt/run_clm_adpt_vn.sh +++ b/scripts/lang_adapt/run_clm_adpt_vn.sh @@ -28,7 +28,7 @@ lng=$2 adapter_reduction_factor=$3 dataset=oscar adapter_config="pfeiffer+inv" -vocabsize=24000 +vocabsize=1000 model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" tokenizer_dir="${FP_BIGS}/tokenizers/${lng}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k cache_dir="${FP_BIGS}/data/" @@ -70,9 +70,9 @@ python $FP_BIGS/multilingual-modeling/scripts/lang_adapt/madx_run_clm.py \ --evaluation_strategy "epoch" \ --max_eval_samples 5000 \ --save_steps 10000 \ - --save_strategy "steps" \ - --save_total_limit 3 \ - --max_train_samples $data_sample \ + --save_strategy "epoch" \ + --save_total_limit 3 \ + --max_train_samples ${data_sample}\ --max_steps 50000 \ --train_adapter \ --load_best_model_at_end \ @@ -80,4 +80,4 @@ python $FP_BIGS/multilingual-modeling/scripts/lang_adapt/madx_run_clm.py \ --embedding_strategies "overlap-replace" \ --adapter_reduction_factor $adapter_reduction_factor \ --adapter_config ${adapter_config} \ - --language $lng + --language $lng &> $output_dir/train.log From 91f739a4c9eadff31f52a8cf02641a8e6944d81c Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 11 May 2022 00:31:40 -0400 Subject: [PATCH 067/142] change zero_shot to cross_lingual --- scripts/eval_xnli/adapters_xnli_de_vn.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index 45ae562..d634e5c 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -34,7 +34,7 @@ parser.add_argument("--do_eval_after_train", default=False, action="store_true") parser.add_argument("--do_predict", default=False, action="store_true") parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") +parser.add_argument("--cross_lingual", default=False, action="store_true") finetune_strategies = ["whole", "lang_adapters", "task_adapters"] parser.add_argument("--madx_lang_adapter") @@ -54,7 +54,7 @@ # load dataset -if args.zero_shot: +if args.cross_lingual: print("0️⃣ 0-Shot") # 0-shot: use english as train and 
validation xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) @@ -75,7 +75,7 @@ # load tokenizer tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: +if args.cross_lingual: en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer en_tokenizer.pad_token = en_tokenizer.eos_token @@ -88,7 +88,7 @@ def en_tokenize_function(examples): logger.info("Tokenizing the dataset...") if args.do_train: - if args.zero_shot: + if args.cross_lingual: full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) else: @@ -144,7 +144,7 @@ def load_model(args, inference=False): pad_token_id=en_tokenizer.pad_token_id, cache_dir=args.cache_dir) - if not args.zero_shot: + if not args.cross_lingual: causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) if not args.original_model == args.pretrained_model: @@ -156,7 +156,7 @@ def load_model(args, inference=False): model.set_active_adapters(adapter_name) if not inference: - #if not args.zero_shot: normally need to add adapter in any case + #if not args.cross_lingual: normally need to add adapter in any case # normally this is already done, why use adapter_lang_name here? #if args.madx_lang_adapter: # adapter_name = model.load_adapter(args.madx_lang_adapter, From aa5256e7e41fb30e8ec168113a012b9f5efa8b0e Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 11 May 2022 00:46:12 -0400 Subject: [PATCH 068/142] load language adapters during inference setting --- scripts/eval_xnli/adapters_xnli_de_vn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index d634e5c..b547509 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -144,7 +144,7 @@ def load_model(args, inference=False): pad_token_id=en_tokenizer.pad_token_id, cache_dir=args.cache_dir) - if not args.cross_lingual: + if inference or not args.cross_lingual: causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) if not args.original_model == args.pretrained_model: From 630f2f68812e0b22b8af0af17e6ab18ed303c3e6 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 15:19:56 +0200 Subject: [PATCH 069/142] updated tokenizer training script --- scripts/lang_adapt/tokenized4clm_sampled.py | 39 +++++++++++++++---- scripts/lang_adapt/train_tokenizer_scratch.sh | 17 ++++++++ scripts/lang_adapt/train_tokenizer_update.sh | 17 ++++++++ 3 files changed, 66 insertions(+), 7 deletions(-) create mode 100644 scripts/lang_adapt/train_tokenizer_scratch.sh create mode 100644 scripts/lang_adapt/train_tokenizer_update.sh diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index 672277a..775815e 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -30,6 +30,7 @@ parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) parser.add_argument('--vocab_size', default=130_000, type=int) parser.add_argument('--extend_vocab', action='store_true') 
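Patch 066 above saves the adapted input embeddings through adapter-transformers' save_embeddings and the positional embeddings with a plain torch.save, and the XNLI evaluation script later reloads them as embedding.pt and positional_embedding.pt. A hedged sketch of the plain-PyTorch round trip (file names follow the evaluation script; whether save_embeddings writes exactly this layout depends on the adapter-transformers version):

import os
import torch

def save_gpt2_embeddings(model, output_dir):
    # Persist the input and positional embedding modules so they can be grafted later.
    torch.save(model.transformer.wte, os.path.join(output_dir, "embedding.pt"))
    torch.save(model.transformer.wpe, os.path.join(output_dir, "positional_embedding.pt"))

def load_gpt2_embeddings(model, adapted_dir):
    # Swap the adapted embeddings into a freshly loaded GPT-2-style model.
    model.transformer.wte = torch.load(os.path.join(adapted_dir, "embedding.pt"))
    model.transformer.wpe = torch.load(os.path.join(adapted_dir, "positional_embedding.pt"))
    return model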
+parser.add_argument('--replace_with_overlap', action='store_true') # this is not working as expected parser.add_argument('--sample_size', default=None, type=int) args = parser.parse_args() @@ -54,14 +55,38 @@ print(f"✅ Loaded raw_datasets OSCAR language {lang}") def batch_iterator(): + global unique_toks batch_size = 1000 for i in range(0, len(raw_datasets), batch_size): - yield raw_datasets[i : i + batch_size]["text"] + sample = raw_datasets[i : i + batch_size]["text"] + unique_toks = unique_toks.union(set(" ".join(sample).split(" "))) + yield sample -tokenizer = AutoTokenizer.from_pretrained("gpt2") -assert tokenizer.is_fast -new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) -print("✅ Trained tokenizer with len ", len(new_tokenizer)) +unique_toks = set() -new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}") -print(f"✅ Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}") +if args.extend_vocab: + tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') + assert tokenizer.is_fast + new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) + print("✅ Trained tokenizer with len ", len(new_tokenizer)) + added = tokenizer.add_tokens([tok for tok in new_tokenizer.vocab.keys()]) + print(f"Overlap with previous vocab: {args.vocab_size - added}") + tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") + print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") +elif args.replace_with_overlap: + # This setting is not really working properly: we need to save the new_tokenizer, but add somehow token that can be used at inference which I don't know how to do (so that it is also get used at tokenization step properly + tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') + + assert tokenizer.is_fast + new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) + print("✅ Trained tokenizer with len ", len(new_tokenizer)) + new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") + print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") +else: + tokenizer = AutoTokenizer.from_pretrained('gpt2') + assert tokenizer.is_fast + new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) + print("Unique toks, ", len(unique_toks)) + print("✅ Trained tokenizer with len ", len(new_tokenizer)) + new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_scratch") + print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_scratch") diff --git a/scripts/lang_adapt/train_tokenizer_scratch.sh b/scripts/lang_adapt/train_tokenizer_scratch.sh new file mode 100644 index 0000000..354efbb --- /dev/null +++ b/scripts/lang_adapt/train_tokenizer_scratch.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=cpu + +# Use more memory (10GB) (CPU RAM): 
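The reworked tokenized4clm_sampled.py above distinguishes training from scratch, extending the original vocabulary, and the (still problematic) overlap-replace mode. The --extend_vocab path trains a fresh target-language tokenizer and then folds its vocabulary into the original one with add_tokens, reporting how much already overlapped. A reduced sketch of that path, with gpt2 standing in for the BigScience checkpoint path hard-coded in the script:

from transformers import AutoTokenizer

def extend_tokenizer(base_name, texts, new_vocab_size):
    base = AutoTokenizer.from_pretrained(base_name)
    assert base.is_fast
    # Learn a target-language tokenizer, then fold its vocabulary into the base one.
    new_tok = base.train_new_from_iterator(iter([texts]), vocab_size=new_vocab_size)
    added = base.add_tokens(list(new_tok.vocab.keys()))
    print(f"requested {new_vocab_size}, newly added {added} (the rest overlapped with the base vocab)")
    return base

# Toy usage on an in-memory sample.
tok = extend_tokenizer("gpt2", ["ein kleines Beispiel auf Deutsch"], new_vocab_size=500)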
+#SBATCH --mem=50g + + + +bs_dir=/tmp-network/user/vnikouli/Projects/bigscience +lng=$1 +sample_size=$2 +vocab_size=$3 +source $bs_dir/multilingual-modeling/scripts/env/bin/activate +python tokenized4clm_sampled.py --lang $lng --tokenizer_dir $bs_dir/tokenizers --hf_cache_dir $bs_dir/data --vocab_size $vocab_size --sample_size $sample_size + diff --git a/scripts/lang_adapt/train_tokenizer_update.sh b/scripts/lang_adapt/train_tokenizer_update.sh new file mode 100644 index 0000000..4c08242 --- /dev/null +++ b/scripts/lang_adapt/train_tokenizer_update.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=cpu + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + + + +bs_dir=/tmp-network/user/vnikouli/Projects/bigscience +lng=$1 +sample_size=$2 +vocab_size=$3 +source $bs_dir/multilingual-modeling/scripts/env/bin/activate +python tokenized4clm_sampled.py --lang $lng --tokenizer_dir $bs_dir/tokenizers --hf_cache_dir $bs_dir/data --vocab_size $vocab_size --sample_size $sample_size --extend_vocab + From c74453157b0b3ba5b50e1c0da715c37991a49860 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 15:36:30 +0200 Subject: [PATCH 070/142] added xnli zero shot training and eval scripts --- scripts/eval_xnli/run_eval_xnli_zero_shot.sh | 67 ++++++++++++++++++++ scripts/eval_xnli/train_xnli_en.sh | 66 +++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 scripts/eval_xnli/run_eval_xnli_zero_shot.sh create mode 100644 scripts/eval_xnli/train_xnli_en.sh diff --git a/scripts/eval_xnli/run_eval_xnli_zero_shot.sh b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh new file mode 100644 index 0000000..cfd8964 --- /dev/null +++ b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh @@ -0,0 +1,67 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gres="gpu:1" +#SBATCH --mem=100g + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com +#SBATCH --constraint="gpu_v100&gpu_32g" + +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience +# Set up the environment by loading modules +source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate + +# XNLI (Cross-Lingual and Supervised Setting) + +LANG=$1 +data_sample=$2 +vocabsize=$3 +adapter_reduction_factor=$4 + +ch=118500 + + +adapter_config="pfeiffer+inv" +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +ORIGINAL_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name +TOKENIZER_DIR="${FP_BIGS}/tokenizers/${LANG}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k +CACHE_DIR="${FP_BIGS}/data/" +data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${data_sample}-$( basename $TOKENIZER_DIR )" +data_tok_dir=${data_dir}/lng_tok + +MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +XNLI_ZH_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full +LR=1e-5 + +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME="$MODEL_DIR/oscar_${LANG}" + +# we finetune task adapters for XNLI +FT_STRATEGIES="task_adapters" + +outdir=$MODEL_DIR/xnli_eval_zero_shot +# evaluate zero-shot training +python adapters_xnli_de_vn.py \ +$XNLI_ZH_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_DIR \ +--original_model $ORIGINAL_MODEL \ +--tokenizer $TOKENIZER_DIR \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ 
+--finetune_strategies "task_adapters" \ +--zero_shot &> $XNLI_ZH_DIR/$( basename $data_dir )-$( basename $MODEL_DIR )_eval.log + + + + +#Remove `--zero_shot` for supervised finetuning setting. + +### Zero-shot Prompt-based Setting + +#See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). diff --git a/scripts/eval_xnli/train_xnli_en.sh b/scripts/eval_xnli/train_xnli_en.sh new file mode 100644 index 0000000..8a9445c --- /dev/null +++ b/scripts/eval_xnli/train_xnli_en.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH -p gpu +#SBATCH --gres="gpu:1" + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J run_clm_madx + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com +#SBATCH --constraint="gpu_v100&gpu_32g" + +# XNLI (Cross-Lingual and Supervised Setting) + +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience +# Set up the environment by loading modules +source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate + +LANG=$1 +data_sample=$2 +vocabsize=$3 +adapter_reduction_factor=$4 + +ch=118500 + + +adapter_config="pfeiffer+inv" +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +ORIGINAL_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name +TOKENIZER_DIR="${FP_BIGS}/tokenizers/${LANG}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k +CACHE_DIR="${FP_BIGS}/data/" +data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${data_sample}-$( basename $TOKENIZER_DIR )" +data_tok_dir=${data_dir}/lng_tok + +MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +OUTPUT_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full +LR=1e-5 + +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME="$MODEL_DIR/oscar_de" + +# we finetune task adapters for XNLI +FT_STRATEGIES="task_adapters" + +mkdir -p $OUTPUT_DIR +python adapters_xnli_de_vn.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_DIR \ +--original_model $ORIGINAL_MODEL \ +--tokenizer $TOKENIZER_DIR \ +--do_train \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--finetune_strategies "task_adapters" \ +--zero_shot &> $OUTPUT_DIR/train.log + From 045c32d561ea0aa30e3d523253428a776781e38d Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 15:37:08 +0200 Subject: [PATCH 071/142] added xnli zero shot training and eval scripts --- scripts/eval_xnli/adapters_xnli_de_vn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index b547509..f10e10a 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -135,7 +135,7 @@ def compute_metrics(eval_pred): def load_model(args, inference=False): # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one # even when we call load_adapter - if not args.original_model == args.pretrained_model: + if not args.original_model == args.pretrained_model and not args.zero_shot: wte = torch.load(f'{args.pretrained_model}/embedding.pt') wpe = torch.load(f'{args.pretrained_model}/positional_embedding.pt') From 
3496bec34c4e5bfc0a18b20ec605c1d1bb55b5d9 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 15:41:10 +0200 Subject: [PATCH 072/142] merged with current version --- scripts/eval_xnli/adapters_xnli_de_vn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index f10e10a..880f3b5 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -135,7 +135,7 @@ def compute_metrics(eval_pred): def load_model(args, inference=False): # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one # even when we call load_adapter - if not args.original_model == args.pretrained_model and not args.zero_shot: + if not args.original_model == args.pretrained_model and not args.cross_lingual: wte = torch.load(f'{args.pretrained_model}/embedding.pt') wpe = torch.load(f'{args.pretrained_model}/positional_embedding.pt') @@ -145,6 +145,7 @@ def load_model(args, inference=False): cache_dir=args.cache_dir) if inference or not args.cross_lingual: + # need to load embedding/adapters from the model adapted to the new language causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) if not args.original_model == args.pretrained_model: From cc4b11354da9abaa5d7bd3b2c8f3fd78c337dc88 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 16:49:24 +0200 Subject: [PATCH 073/142] added script to get stats about different tokenizers --- scripts/lang_adapt/compute_tok_overlap.py | 93 +++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 scripts/lang_adapt/compute_tok_overlap.py diff --git a/scripts/lang_adapt/compute_tok_overlap.py b/scripts/lang_adapt/compute_tok_overlap.py new file mode 100644 index 0000000..23bd70c --- /dev/null +++ b/scripts/lang_adapt/compute_tok_overlap.py @@ -0,0 +1,93 @@ +import sys +import json +import datasets +from datasets import load_dataset +from transformers import AutoTokenizer +import numpy as np +from collections import defaultdict +import math +import argparse +import matplotlib.pyplot as plt + +def get_en_tokenizer(): + en_tok = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') + return en_tok + +def getdata(lng): + flores_path="/tmp-network/user/vnikouli/Projects/NLE-NMT/data/test_sets/" + with open(f'{flores_path}/FLORES-valid.{lng}') as f: + dataset = f.readlines() + return dataset + +def gettokens(tok, dataset): + from collections import defaultdict + seq_lengths = [] + toks_occ = defaultdict(int) + for i,l in enumerate(dataset): + toks = tok.tokenize(l.strip()) + seq_lengths.append(len(toks)) + toks_occ.update({t:toks_occ[t]+1 for t in toks }) + return np.array(seq_lengths), toks_occ + + + +def plot_histogram(tokoccs, name, ax, nb_bins): + ax.hist(tokoccs, nb_bins, histtype='bar', label=name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--lang', type=str, required=True) + parser.add_argument('--tokenizers', type=str, nargs='+', + help='an integer for the accumulator') + parser.add_argument('--plot_name', type=str, default="stats_plot") + args = parser.parse_args() + lng = args.lang + tokenizers = args.tokenizers + vocabs = {} + dataset=getdata(lng) + en_dataset = getdata("en") + seq_lengths = {} + tok_occs = {} + en_tok = 
get_en_tokenizer() + sl, to = gettokens(en_tok, en_dataset) + seq_lengths['en'] = sl + + for t in tokenizers: + tok = AutoTokenizer.from_pretrained(t) + sl, to = gettokens(tok, dataset) + seq_lengths[t] = sl + tok_occs[t] = to + with open(f'{t}/vocab.json') as jsonFile: + vocab = json.load(jsonFile) + vocabs[t] = set(vocab.keys()) + + + print("Print tokenization stats") + print("===============================") + fig, ax = plt.subplots(1, 4, figsize=(40, 10)) + for t in tokenizers: + print(f'Tokenizer {t}, avg tokenized seq length: {np.mean(seq_lengths[t])} (shorter sequences are better)') + #we want to decompose sentence in {lng} in approximately the same nb of tokens as in English hoping that it will favour knowledge transfer + x = seq_lengths[t]/seq_lengths["en"] + print(f'Tokenizer {t}, avg ratio with En tokenized sentence length: {np.mean(x)}+/- {np.std(x)}') + baseline_overlap = vocabs[t].intersection(set(en_tok.vocab.keys())) + print(f"Overlap with original tokenizer vocab : {len(baseline_overlap)} ") + print(f"Overlap between new tokenizer vocab and obtained tokenswith original tokenizer vocab : {len(baseline_overlap)} ") + + + + print("Do plotting") + fig, ax = plt.subplots(1, 4, figsize=(40, 10)) + ax[0].set_title("Token occ distribution") + plot_histogram([[math.log(v) for v in tok_occs[t].values()] for t in tokenizers], tokenizers, ax[0], 10) + ax[1].set_title("Seq length distribution") + plot_histogram([seq_lengths[t] for t in tokenizers], tokenizers, ax[1], 10) + ax[2].set_title("Diff wtih en seq length distribution") + plot_histogram([seq_lengths[t]/seq_lengths["en"] for t in tokenizers], tokenizers, ax[2], 10) + ax[3].set_title("Tok length distribution") + plot_histogram([[len(v) for v in vocabs[t] for i in range(tok_occs[t][v])] for t in tokenizers], tokenizers, ax[3], 10) + ax[1].legend() + fig.savefig(f"{args.plot_name}.png") + + From 4a2d031bcfc9cee9d4e5188f31f4b72c92ed1835 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 16:53:07 +0200 Subject: [PATCH 074/142] added script to get stats about different tokenizers --- scripts/lang_adapt/compute_tok_overlap.py | 30 +++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/lang_adapt/compute_tok_overlap.py b/scripts/lang_adapt/compute_tok_overlap.py index 23bd70c..533d1bb 100644 --- a/scripts/lang_adapt/compute_tok_overlap.py +++ b/scripts/lang_adapt/compute_tok_overlap.py @@ -39,8 +39,8 @@ def plot_histogram(tokoccs, name, ax, nb_bins): parser = argparse.ArgumentParser() parser.add_argument('--lang', type=str, required=True) parser.add_argument('--tokenizers', type=str, nargs='+', - help='an integer for the accumulator') - parser.add_argument('--plot_name', type=str, default="stats_plot") + help='list of the tokenizers for which you want to get statstics') + parser.add_argument('--plot_name', type=str, default=None, help="If set generate plots containing tokens distribution across different axes (frequency, length, etc)") args = parser.parse_args() lng = args.lang tokenizers = args.tokenizers @@ -76,18 +76,18 @@ def plot_histogram(tokoccs, name, ax, nb_bins): print(f"Overlap between new tokenizer vocab and obtained tokenswith original tokenizer vocab : {len(baseline_overlap)} ") - - print("Do plotting") - fig, ax = plt.subplots(1, 4, figsize=(40, 10)) - ax[0].set_title("Token occ distribution") - plot_histogram([[math.log(v) for v in tok_occs[t].values()] for t in tokenizers], tokenizers, ax[0], 10) - ax[1].set_title("Seq length distribution") - 
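compute_tok_overlap.py above reduces to a few statistics per tokenizer: mean tokenized sequence length, its ratio to a reference tokenization (the script uses the English side of FLORES tokenized with the original tokenizer), and vocabulary overlap. A self-contained sketch of those computations on an in-memory sentence list; for simplicity it tokenizes the same sentences with both tokenizers instead of a parallel corpus, and the tokenizer names are placeholders:

import numpy as np
from transformers import AutoTokenizer

def tokenizer_stats(tok_name, ref_tok_name, sentences):
    tok = AutoTokenizer.from_pretrained(tok_name)
    ref = AutoTokenizer.from_pretrained(ref_tok_name)
    lens = np.array([len(tok.tokenize(s)) for s in sentences])
    ref_lens = np.array([len(ref.tokenize(s)) for s in sentences])
    overlap = set(tok.get_vocab()) & set(ref.get_vocab())
    return {
        "avg_len": lens.mean(),                    # shorter sequences are better
        "ratio_to_ref": (lens / ref_lens).mean(),  # close to 1.0 means comparable fertility
        "vocab_overlap": len(overlap),             # entries shared with the reference vocab
    }

print(tokenizer_stats("gpt2", "gpt2", ["Ein Beispielsatz.", "Noch ein Satz."]))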
plot_histogram([seq_lengths[t] for t in tokenizers], tokenizers, ax[1], 10) - ax[2].set_title("Diff wtih en seq length distribution") - plot_histogram([seq_lengths[t]/seq_lengths["en"] for t in tokenizers], tokenizers, ax[2], 10) - ax[3].set_title("Tok length distribution") - plot_histogram([[len(v) for v in vocabs[t] for i in range(tok_occs[t][v])] for t in tokenizers], tokenizers, ax[3], 10) - ax[1].legend() - fig.savefig(f"{args.plot_name}.png") + if args.plot_name: + print("Do plotting") + fig, ax = plt.subplots(1, 4, figsize=(40, 10)) + ax[0].set_title("Token occ distribution") + plot_histogram([[math.log(v) for v in tok_occs[t].values()] for t in tokenizers], tokenizers, ax[0], 10) + ax[1].set_title("Seq length distribution") + plot_histogram([seq_lengths[t] for t in tokenizers], tokenizers, ax[1], 10) + ax[2].set_title("Diff wtih en seq length distribution") + plot_histogram([seq_lengths[t]/seq_lengths["en"] for t in tokenizers], tokenizers, ax[2], 10) + ax[3].set_title("Tok length distribution") + plot_histogram([[len(v) for v in vocabs[t] for i in range(tok_occs[t][v])] for t in tokenizers], tokenizers, ax[3], 10) + ax[1].legend() + fig.savefig(f"{args.plot_name}.png") From 4dd8cea25c7f0277d9b60c31bcc620b7d6a18c26 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 16:58:12 +0200 Subject: [PATCH 075/142] added script to get stats about different tokenizers --- scripts/lang_adapt/compute_tok_overlap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/lang_adapt/compute_tok_overlap.py b/scripts/lang_adapt/compute_tok_overlap.py index 533d1bb..a31e504 100644 --- a/scripts/lang_adapt/compute_tok_overlap.py +++ b/scripts/lang_adapt/compute_tok_overlap.py @@ -73,7 +73,8 @@ def plot_histogram(tokoccs, name, ax, nb_bins): print(f'Tokenizer {t}, avg ratio with En tokenized sentence length: {np.mean(x)}+/- {np.std(x)}') baseline_overlap = vocabs[t].intersection(set(en_tok.vocab.keys())) print(f"Overlap with original tokenizer vocab : {len(baseline_overlap)} ") - print(f"Overlap between new tokenizer vocab and obtained tokenswith original tokenizer vocab : {len(baseline_overlap)} ") + overlap_vocab_toks = vocabs[t].intersection(set(tok_occs[t].keys())) + print(f"Which portion of new tokenizer was used? : {len(overlap_vocab_toks)}, represents {1.0*len(overlap_vocab_toks}/len(vocabs[t])}% of learnt vocab ") if args.plot_name: From 286706adb12d211df2d7bd52ec486728f954f631 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 17:03:27 +0200 Subject: [PATCH 076/142] added script to get stats about different tokenizers --- scripts/lang_adapt/compute_tok_overlap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/compute_tok_overlap.py b/scripts/lang_adapt/compute_tok_overlap.py index a31e504..8f95394 100644 --- a/scripts/lang_adapt/compute_tok_overlap.py +++ b/scripts/lang_adapt/compute_tok_overlap.py @@ -74,7 +74,7 @@ def plot_histogram(tokoccs, name, ax, nb_bins): baseline_overlap = vocabs[t].intersection(set(en_tok.vocab.keys())) print(f"Overlap with original tokenizer vocab : {len(baseline_overlap)} ") overlap_vocab_toks = vocabs[t].intersection(set(tok_occs[t].keys())) - print(f"Which portion of new tokenizer was used? : {len(overlap_vocab_toks)}, represents {1.0*len(overlap_vocab_toks}/len(vocabs[t])}% of learnt vocab ") + print(f"Which portion of new tokenizer was used? 
: {len(overlap_vocab_toks)}, represents {100.0*len(overlap_vocab_toks)/len(vocabs[t])}% of learnt vocab ") if args.plot_name: From cf376c6766c5c89801190ced9336e0f2626287a6 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Wed, 11 May 2022 17:41:45 +0200 Subject: [PATCH 077/142] fixed tokenizer training with unk token --- scripts/lang_adapt/tokenized4clm_sampled.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index 775815e..71e1fea 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -75,8 +75,8 @@ def batch_iterator(): print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") elif args.replace_with_overlap: # This setting is not really working properly: we need to save the new_tokenizer, but add somehow token that can be used at inference which I don't know how to do (so that it is also get used at tokenization step properly - tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') - + tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/', unk_token="") + assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) print("✅ Trained tokenizer with len ", len(new_tokenizer)) From 089071d39b49a422de13e928baa8aa85348f65e4 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 12 May 2022 21:24:38 -0400 Subject: [PATCH 078/142] rename pretrained_model to adapted_model --- scripts/eval_xnli/adapters_xnli_de_vn.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py index 880f3b5..3e29ddd 100644 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ b/scripts/eval_xnli/adapters_xnli_de_vn.py @@ -27,7 +27,7 @@ parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") +parser.add_argument("--adapted_model") parser.add_argument("--original_model") parser.add_argument("--tokenizer") parser.add_argument("--do_train", default=False, action="store_true") @@ -46,8 +46,8 @@ args.do_predict = True if args.original_model is None: - # here: because the wpe is not saved, pretrained_model is the original bigsciece model - args.original_model = args.pretrained_model + # here: because the wpe is not saved, adapted_model is the original bigsciece model + args.original_model = args.adapted_model print("Arguments: ========") print(args) @@ -135,9 +135,9 @@ def compute_metrics(eval_pred): def load_model(args, inference=False): # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one # even when we call load_adapter - if not args.original_model == args.pretrained_model and not args.cross_lingual: - wte = torch.load(f'{args.pretrained_model}/embedding.pt') - wpe = torch.load(f'{args.pretrained_model}/positional_embedding.pt') + if not args.original_model == args.adapted_model and not args.cross_lingual: + wte = torch.load(f'{args.adapted_model}/embedding.pt') + wpe = 
torch.load(f'{args.adapted_model}/positional_embedding.pt') model = GPT2ForSequenceClassification.from_pretrained(args.original_model, num_labels=3, @@ -148,7 +148,7 @@ def load_model(args, inference=False): # need to load embedding/adapters from the model adapted to the new language causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) - if not args.original_model == args.pretrained_model: + if not args.original_model == args.adapted_model: causal_lm_model.transformer.wte = wte causal_lm_model.transformer.wpe = wpe if args.madx_lang_adapter: From d969c5c76eea2d506eaf419cf6f381e19273ec39 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 12 May 2022 21:28:24 -0400 Subject: [PATCH 079/142] use updated eval_xnli/adapters_xnli_de_vn.py --- scripts/eval_xnli/adapters_xnli_de.py | 137 ++++++++++++-------------- 1 file changed, 61 insertions(+), 76 deletions(-) diff --git a/scripts/eval_xnli/adapters_xnli_de.py b/scripts/eval_xnli/adapters_xnli_de.py index 46140aa..3e29ddd 100644 --- a/scripts/eval_xnli/adapters_xnli_de.py +++ b/scripts/eval_xnli/adapters_xnli_de.py @@ -27,18 +27,18 @@ parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") +parser.add_argument("--adapted_model") parser.add_argument("--original_model") parser.add_argument("--tokenizer") parser.add_argument("--do_train", default=False, action="store_true") parser.add_argument("--do_eval_after_train", default=False, action="store_true") parser.add_argument("--do_predict", default=False, action="store_true") parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") +parser.add_argument("--cross_lingual", default=False, action="store_true") finetune_strategies = ["whole", "lang_adapters", "task_adapters"] parser.add_argument("--madx_lang_adapter") -parser.add_argument("--adapter_lang_name", required=True) +#parser.add_argument("--adapter_lang_name", required=True) -- why is this required?? 
parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) args = parser.parse_args() @@ -46,21 +46,20 @@ args.do_predict = True if args.original_model is None: - # here: because the wpe is not saved, pretrained_model is the original bigsciece model - args.original_model = args.pretrained_model + # here: because the wpe is not saved, adapted_model is the original bigsciece model + args.original_model = args.adapted_model print("Arguments: ========") print(args) # load dataset -if args.zero_shot: +if args.cross_lingual: print("0️⃣ 0-Shot") # 0-shot: use english as train and validation xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) assert args.lang != "en" - train_dataset = xnli_en_dataset['train'] val_dataset = xnli_en_dataset['validation'] test_dataset = xnli_dataset['test'] @@ -76,7 +75,7 @@ # load tokenizer tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: +if args.cross_lingual: en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer en_tokenizer.pad_token = en_tokenizer.eos_token @@ -88,21 +87,23 @@ def en_tokenize_function(examples): logger.info("Tokenizing the dataset...") -if args.zero_shot: - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) -else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False) - full_val_dataset = val_dataset.map(tokenize_function, batched=False) +if args.do_train: + if args.cross_lingual: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) + else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + + + small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) + small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) + logger.info(full_train_dataset[0]) + logger.info(full_train_dataset[100]) full_test_dataset = test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) -logger.info(full_train_dataset[0]) -logger.info(full_train_dataset[100]) - from datasets import load_metric metric = load_metric("xnli") @@ -132,51 +133,40 @@ def compute_metrics(eval_pred): ) def load_model(args, inference=False): - # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are the original one # even when we call load_adapter - if args.zero_shot and not inference: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=en_tokenizer.pad_token_id, - cache_dir=args.cache_dir) - else: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=tokenizer.pad_token_id, - cache_dir=args.cache_dir) - - if not args.zero_shot or (args.zero_shot and inference): - # if not zero shot, that means that we need to replace the embedding layers during training - # 
we also need to replace embedding layers during inference - causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + if not args.original_model == args.adapted_model and not args.cross_lingual: + wte = torch.load(f'{args.adapted_model}/embedding.pt') + wpe = torch.load(f'{args.adapted_model}/positional_embedding.pt') + + model = GPT2ForSequenceClassification.from_pretrained(args.original_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) - # change the embedding layer of the original big science model - # by loading the adapters (which has saved lm_head) + if inference or not args.cross_lingual: + # need to load embedding/adapters from the model adapted to the new language + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) causal_lm_model.resize_token_embeddings(len(tokenizer)) + if not args.original_model == args.adapted_model: + causal_lm_model.transformer.wte = wte + causal_lm_model.transformer.wpe = wpe if args.madx_lang_adapter: - causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") - - # model has original bigscience embedding so replace it. - model.resize_token_embeddings(len(tokenizer)) - model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] + adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + model.transformer = causal_lm_model.transformer + model.set_active_adapters(adapter_name) if not inference: - if not args.zero_shot: - if args.madx_lang_adapter: - adapter_name = model.load_adapter(args.madx_lang_adapter, - config="pfeiffer+inv", - load_as=args.adapter_lang_name) - if args.finetune_strategies == "whole": - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "lang_adapters": - model.train_adapter([args.adapter_lang_name]) - elif args.finetune_strategies == "task_adapters": - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - else: - raise ValueError("Lack configuration") - + #if not args.cross_lingual: normally need to add adapter in any case + # normally this is already done, why use adapter_lang_name here? 
+ #if args.madx_lang_adapter: + # adapter_name = model.load_adapter(args.madx_lang_adapter, + # config="pfeiffer+inv", + # load_as=args.adapter_lang_name) + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + + print("🔥 ==================== Training: ==================== 🔥") for name, param in model.named_parameters(): if not param.requires_grad: @@ -185,24 +175,19 @@ def load_model(args, inference=False): print(f"🚀 Trainable layer '{name}'") print(model) else: - print("🔥 ==================== Inference: ==================== 🔥") - if args.finetune_strategies == "lang_adapters": - assert args.pretrained_adapters_dir - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.adapter_lang_name}") - model.set_active_adapters(adapter_name) - elif args.finetune_strategies == "task_adapters": - if args.madx_lang_adapter: - assert args.pretrained_adapters_dir - adapter_name = model.load_adapter(args.madx_lang_adapter) - model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") - model.set_active_adapters(adapter_name) - else: - # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") - - # for TGT -> TGT supervised finetuning setting, change adapter_name - adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") - model.set_active_adapters(adapter_name) + #if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + # normally this is done in any case + #adapter_name = model.load_adapter(args.madx_lang_adapter) + #model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + #else: + # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + # # not sure what happens here + # # for TGT -> TGT supervised finetuning setting, change adapter_name + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + # model.set_active_adapters(adapter_name) print(model) return model @@ -241,4 +226,4 @@ def load_model(args, inference=False): compute_metrics=compute_metrics ) - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file + print("Evaluate on Test:", trainer.evaluate()) From a13f08224fc5199eddaf215050e24b2f57482b4e Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 12 May 2022 21:34:47 -0400 Subject: [PATCH 080/142] update XNLI --- scripts/eval_xnli/README.md | 5 ++++- scripts/eval_xnli/run_eval_xnli_zero_shot.sh | 2 +- .../eval_xnli/{train_xnli_en.sh => train_xnli_zero_shot.sh} | 0 3 files changed, 5 insertions(+), 2 deletions(-) rename scripts/eval_xnli/{train_xnli_en.sh => train_xnli_zero_shot.sh} (100%) diff --git a/scripts/eval_xnli/README.md b/scripts/eval_xnli/README.md index 17fc051..f7c1195 100644 --- a/scripts/eval_xnli/README.md +++ b/scripts/eval_xnli/README.md @@ -30,13 +30,16 @@ $OUTPUT_DIR \ --do_train \ --do_eval_after_train \ --madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ ---adapter_lang_name "xnli-de" \ --finetune_strategies $FT_STRATEGIES \ --zero_shot ``` Remove `--zero_shot` for supervised finetuning setting. 
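For reference, the supervised (in-language) run would look roughly like the sketch below. This is only an illustration, not part of the patch: it assumes the flag rename from `--zero_shot` to `--cross_lingual` and the removal of the required `--adapter_lang_name` introduced in the two preceding patches, and every `$...` value is a placeholder (only `$OUTPUT_DIR`, `$MADX_LANG_ADAPTER_NAME`, and `$FT_STRATEGIES` appear in the snippet above; the rest are assumed).

```bash
# Sketch of the supervised (in-language) XNLI fine-tuning call, assuming the
# renamed --cross_lingual flag; flag names follow the argparse block in
# adapters_xnli_de.py as updated in this patch series. All $VARS are placeholders.
python adapters_xnli_de.py \
$OUTPUT_DIR \
--lang $LANG \
--cache_dir $CACHE_DIR \
--learning_rate $LR \
--adapted_model $ADAPTED_MODEL_DIR \
--original_model $ORIGINAL_MODEL \
--tokenizer $TOKENIZER_NAME \
--do_train \
--do_eval_after_train \
--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \
--finetune_strategies $FT_STRATEGIES
```

Omitting `--cross_lingual` makes the script load XNLI train/validation data in `$LANG` instead of English, and `load_model` swaps in the adapted embeddings (`embedding.pt` / `positional_embedding.pt`) whenever `--adapted_model` differs from `--original_model`.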
+Notes: +- `adapters_xnli_de_vn.py` is Vassilina's forked of `adapters_xnli_de.py`. +- `train_xnli_zero_shot.sh` is the batch script for XNLI training, and `run_eval_xnli_zero_shot.sh` is for evaluating trained XNLI task adapters. + ### Zero-shot Prompt-based Setting See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). \ No newline at end of file diff --git a/scripts/eval_xnli/run_eval_xnli_zero_shot.sh b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh index cfd8964..855cde9 100644 --- a/scripts/eval_xnli/run_eval_xnli_zero_shot.sh +++ b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh @@ -30,7 +30,7 @@ data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${da data_tok_dir=${data_dir}/lng_tok MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" -XNLI_ZH_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full +XNLI_ZH_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full # output directory LR=1e-5 # language adapters checkpoint folder diff --git a/scripts/eval_xnli/train_xnli_en.sh b/scripts/eval_xnli/train_xnli_zero_shot.sh similarity index 100% rename from scripts/eval_xnli/train_xnli_en.sh rename to scripts/eval_xnli/train_xnli_zero_shot.sh From ab01bce3a5cf3b0270b6c1c61c3f4fe4de5ac568 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 30 May 2022 21:37:45 -0400 Subject: [PATCH 081/142] update --- scripts/README.md | 6 + .../compute_retrieval_acc.sh | 22 + .../compute_retrieval_acc_bs.sh | 10 + .../eval_sentence_retrieval.py | 222 +++++++ scripts/archive/madx_exp/madx_lngembft_clm.py | 617 +++++++++++++++++ .../madx_exp/madxlastlayer_lngembft_clm.py | 618 ++++++++++++++++++ .../archive/madx_exp/run_clm_madx_lngemb.sh | 68 ++ scripts/archive/xnli/README.md | 80 +++ scripts/archive/xnli/archive_xnli.py | 222 +++++++ scripts/archive/xnli/xnli_v2.py | 213 ++++++ scripts/eval/README.md | 45 ++ scripts/eval/adapters_xnli_de.py | 229 +++++++ scripts/eval/adapters_xnli_de_vn.py | 229 +++++++ scripts/eval/run_eval_xnli_zero_shot.sh | 67 ++ scripts/eval/train_xnli_zero_shot.sh | 66 ++ scripts/lang_adapt/README.md | 3 + scripts/lang_adapt/madx_run_clm.py | 37 +- scripts/lang_adapt/run_clm_emb.sh | 34 +- scripts/lang_adapt/tokenized4clm_sampled.py | 27 +- 19 files changed, 2782 insertions(+), 33 deletions(-) create mode 100644 scripts/README.md create mode 100644 scripts/archive/exp_sentence_retrievale_eval/compute_retrieval_acc.sh create mode 100644 scripts/archive/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh create mode 100644 scripts/archive/exp_sentence_retrievale_eval/eval_sentence_retrieval.py create mode 100644 scripts/archive/madx_exp/madx_lngembft_clm.py create mode 100644 scripts/archive/madx_exp/madxlastlayer_lngembft_clm.py create mode 100644 scripts/archive/madx_exp/run_clm_madx_lngemb.sh create mode 100644 scripts/archive/xnli/README.md create mode 100644 scripts/archive/xnli/archive_xnli.py create mode 100644 scripts/archive/xnli/xnli_v2.py create mode 100644 scripts/eval/README.md create mode 100644 scripts/eval/adapters_xnli_de.py create mode 100644 scripts/eval/adapters_xnli_de_vn.py create mode 100644 scripts/eval/run_eval_xnli_zero_shot.sh create mode 100644 scripts/eval/train_xnli_zero_shot.sh diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..2955ab5 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,6 @@ +### README + +This folder contains everything we need for running BigScience language adaptation 
experiments. + +Google Doc: [BigScience - Extending BLOOM to New Languages](https://docs.google.com/document/d/1OEJq2max5kLPF4mnnb9nyoodqR_z_UVQlw4tVx9TvTc/edit#heading=h.kk1966kbedef) + diff --git a/scripts/archive/exp_sentence_retrievale_eval/compute_retrieval_acc.sh b/scripts/archive/exp_sentence_retrievale_eval/compute_retrieval_acc.sh new file mode 100644 index 0000000..a0afcd8 --- /dev/null +++ b/scripts/archive/exp_sentence_retrievale_eval/compute_retrieval_acc.sh @@ -0,0 +1,22 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gres="gpu:1" +#SBATCH --ntasks=16 +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J eval_retrieval_acc + +# Specify an output file +#SBATCH -o /tmp-network/user/vnikouli/Projects/bigscience/logs/eval_retrieval_acc-%j.out +#SBATCH -e /tmp-network/user/vnikouli/Projects/bigscience/logs/eval_retrieval_acc-%j.err + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com + + +model=$1 +dataset=$2 +outdir=retrieval_acc_${model}-${dataset} +mkdir $outdir +python eval_sentence_retrieval.py $outdir --pretrained_model $model --tokenizer $model --dataset $dataset diff --git a/scripts/archive/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh b/scripts/archive/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh new file mode 100644 index 0000000..5c7efc2 --- /dev/null +++ b/scripts/archive/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh @@ -0,0 +1,10 @@ +for model in tr5b-1B3-multilingual-alpha-checkpoints; do + for ch in 12000 55500 99000 100500 117000 118500; do + mname=${model}/ch${ch} + for dataset in flores ted_multi; do + outdir=retrieval_acc_${model}-${dataset} + mkdir -p $outdir + sbatch compute_retrieval_acc.sh ${mname} ${dataset} + done + done +done diff --git a/scripts/archive/exp_sentence_retrievale_eval/eval_sentence_retrieval.py b/scripts/archive/exp_sentence_retrievale_eval/eval_sentence_retrieval.py new file mode 100644 index 0000000..3fdf4e3 --- /dev/null +++ b/scripts/archive/exp_sentence_retrievale_eval/eval_sentence_retrieval.py @@ -0,0 +1,222 @@ +import logging +import argparse +import os +from datasets import load_dataset +from collections import namedtuple +import torch +import numpy as np +from transformers import BertTokenizer, BertModel +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM +import matplotlib +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import os.path +import sys +from loguru import logger +import random +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--pretrained_model", default="bert-base-multilingual-cased") +parser.add_argument("--tokenizer", default="bert-base-multilingual-cased") +parser.add_argument("--dataset", default="ted_multi") +parser.add_argument("--device", default="cuda") +args = parser.parse_args() + +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) +ted_lngs = ['am', 'ar', 'bn', 'ca', 'en', 'es', 'fr', 'hi', 'id', 'ja', 'pt', 'zh-cn', 'zh-tw', 'pt-br'] +flores_lng = ["amh", "bos", "cat", "eng", "spa", "fra", "hin", "ind", "jpn", "por", "swh", "vie", "urd"] +bs_languages = ["id", "eu", "vi", "zh", "ur", "es", "ca", "pt", "fr", "en", "hi", "ar", "bn"] +lngcode_map = {"am":"amh", "bn":"bos", "ca":"cat", "en":"eng", "es":"spa", "fr": "fra", "hi": "hin", "id": "ind", "ja": "jpn", "pt": "por", "ur":"urd", "vi":"vie" } + + +print("Arguments: 
========") +print(args) + + +def load_dataset_(args): + if args.dataset == "ted_multi": + return load_dataset_ted(args) + if args.dataset == "flores": + return load_dataset_flores(args) + + +def load_dataset_flores_for_lng(args, lng): + dataset = load_dataset("gsarti/flores_101", lngcode_map[lng])['dev'] + return dataset + +def load_dataset_flores(args): + dataset = {} + for lng in bs_languages: + if lng in lngcode_map: + load_dataset_flores_for_lng(args, lng) + return dataset + +def load_dataset_ted(args): + dataset = load_dataset("ted_multi")['validation'] + return dataset + +def get_talks(dataset, nb_talks): + talk_names = [] + for t in dataset['talk_name']: + if len(talk_names) < nb_talks and not t in talk_names: + talk_names.append(t) + + + print([(t1, len([t for t in dataset['talk_name'] if t == t1])) for t1 in talk_names]) + return talk_names + +def load_model(args): + if "xlm" in args.pretrained_model or "bert" in args.pretrained_model: + model = AutoModelForMaskedLM.from_pretrained(args.pretrained_model) + else: + model = AutoModelForCausalLM.from_pretrained(args.pretrained_model) + model.config.output_hidden_states=True + return model.to(args.device) + +Sample = namedtuple( + "Sample", + ("id", "hidden_state") +) + +def load_from_file(fname): + return torch.load(fname) + + +def get_hidden_states(args, model): + if args.dataset == "ted_multi": + dataset = load_dataset_(args) + nb_talks = 2 + talks = get_talks(dataset, nb_talks) + + emb = get_hidden_states_for_talks(dataset, model, talks, args.pretrained_model) + + outname = f"{args.output_dir}/{args.pretrained_model.replace('/','-')}-talks-valid-{len(talks)}" + + elif args.dataset == "flores": + nb_samples = 200 + emb = get_hidden_states_for_flores(args, model, args.pretrained_model, nb_samples = nb_samples) + outname = f"{args.output_dir}/{args.pretrained_model.replace('/','-')}-flores-{nb_samples}" + + retrieval_acc = {} + nb_states = model.config.num_hidden_layers + fig, ax = plt.subplots(1, int(nb_states/step), figsize=(12*int(nb_states/step), 10)) + + + with open(f"{outname}.log", 'w') as fout: + for state in range(0, nb_states, step): + plot_retrieval_acc(state, emb, ax[int(state/step)], fout) + + fig.tight_layout() + plt.savefig(f'{outname}-heatmap.png') + + +def get_hidden_states_for_flores(args, model, mname, nb_samples=50): + emb = {} + hidden_state_size = model.config.num_hidden_layers + for lng in bs_languages: + if lng in lngcode_map: + fname = f"{args.output_dir}/flores-{lng}-{nb_samples}-{mname.replace('/','-')}.pt" + if os.path.isfile(fname): + emb[lng] = load_from_file(fname) + else: + dataset = load_dataset_flores_for_lng(args, lng) + emb[lng] = {} + for state in range(hidden_state_size): + emb[lng][state] = [] + for i, sid in enumerate(dataset['id'][:nb_samples]): + t = dataset['sentence'][i] + x = tokenizer(t, return_tensors="pt").input_ids.to(model.device) + out = model(x) + for state in range(hidden_state_size): + hs = torch.mean(out.hidden_states[state][0][1:-1], dim=0).detach() + emb[lng][state].append(Sample(sid, hs)) + torch.save(emb[lng], fname) + return emb + + +def get_hidden_states_for_talks(dataset, model, talks, mname): + emb = {} + hidden_state_size = model.config.num_hidden_layers + fname = f"{args.output_dir}/ted_multi-{mname.replace('/','-')}-ted_multi-{len(talks)}.pt" + if os.path.isfile(fname): + emb = load_from_file(fname) + return emb + for sid, sample in enumerate(dataset): + if sample['talk_name'] in talks: + tsample = sample['translations'] + for i, lng in enumerate(tsample['language']): 
+ if lng in bs_languages: + t = tsample['translation'][i] + x = tokenizer(t, return_tensors="pt").input_ids.to(model.device) + if not lng in emb: + emb[lng] = {} + for state in range(hidden_state_size): + emb[lng][state] = [] + out = model(x) + for state in range(hidden_state_size): + hs = torch.mean(out.hidden_states[state][0], dim=0).detach() + emb[lng][state].append(Sample(sid, hs)) + torch.save(emb, fname) + return emb + + +def compute_sent_retrieval_acc(lng1, lng2, emb, state, out): + cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6) + E1 = torch.stack([s[1] for s in emb[lng1][state]]) + E2 = torch.stack([s[1] for s in emb[lng2][state]]) + #cos_matrix = [[cos(E2[i],E1[j]) for i in range(E2.shape[0]) ] for j in range(E1.shape[0])] + match = 0 + intersection_ids = set([emb[lng1][state][i][0] for i in range(E1.shape[0])]).intersection( + set([emb[lng2][state][i][0] for i in range(E2.shape[0])]) + ) + if len(intersection_ids)>0: + random_acc = 1/len(intersection_ids) + for i in range(E1.shape[0]): + if emb[lng1][state][i][0] in intersection_ids: + cos_sim = [cos(E2[j], E1[i]) for j in range(E2.shape[0])] + best_match = torch.argmax(torch.stack(cos_sim)) + if emb[lng2][state][best_match][0] == emb[lng1][state][i][0]: + match +=1 + acc = match/len(intersection_ids) + out.write(f"{lng1}-{lng2} = {acc} (random {random_acc} )\n") + return acc, len(intersection_ids) + else: + return 0, 0 + +def plot_retrieval_acc(state, emb, ax, out): + cmap="RdYlBu" + mean_per_state = 0 + for lng1 in emb: + if not lng1 in retrieval_acc: + retrieval_acc[lng1] = {} + for lng2 in emb: + lng2_chance = 1.0/len(emb[lng2][0]) + #if not lng1 == lng2: + acc, random_acc = compute_sent_retrieval_acc(lng1, lng2, emb, state, out) + retrieval_acc[lng1][lng2] = acc + #retrieval_acc[lng1]["random"] = lng2_chance + mean_acc = np.mean([v for v in retrieval_acc[lng1].values()]) + out.write(f"ACC per {lng1}, layer {state} = {mean_acc} \n" ) + mean_per_state +=mean_acc + mean_per_state = mean_per_state/len(emb.keys()) + out.write(f"ACC overall, layer {state} = {mean_per_state}\n" ) + m_res = pd.DataFrame(retrieval_acc) + m_res.columns=emb.keys() + m_res.index=emb.keys()#[e for e in emb.keys()]+["random"] + ax.set_title(f"state {state}") + sns.heatmap(m_res, ax=ax, annot=False, vmin=0, vmax=1.0, center=0, cmap=cmap) + + + +lngs2consider = ['am', 'ar', 'bn', 'ca', 'en', 'es', 'fr', 'hi', 'id', 'ja', 'pt', 'zh-cn', 'zh-tw', 'pt-br'] +samples = 10 +model = load_model(args) +retrieval_acc = {} +step=1 +get_hidden_states(args, model) diff --git a/scripts/archive/madx_exp/madx_lngembft_clm.py b/scripts/archive/madx_exp/madx_lngembft_clm.py new file mode 100644 index 0000000..45b7c35 --- /dev/null +++ b/scripts/archive/madx_exp/madx_lngembft_clm.py @@ -0,0 +1,617 @@ +""" +Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import torch +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +import transformers.adapters.composition as ac +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AdapterTrainer, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + MultiLingAdapterArguments, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.adapters.configuration import AdapterConfig +from transformers.testing_utils import 
CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def load_tokenizer(model_args): + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
+ ) + return tokenizer + + + +def load_data(data_args, model_args): + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + + if "validation" not in raw_datasets.keys(): + if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: + raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) + elif data_args.max_eval_samples is not None : + raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) + else: + raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) + + raw_datasets['validation'] = raw_datasets['test'] + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ + return raw_datasets + +def load_model(model_args, tokenizer): + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + #TODO: remap embedding parameters + #if not tokenizer.name_or_path == model_args.model_name_or_path: + # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + + model.resize_token_embeddings(len(tokenizer)) + return model + +def preprocess_data(training_args, data_args, model_args, tokenizer): + with training_args.main_process_first(desc="dataset map tokenization"): + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_datasets.pt") + if not tokenizer.name_or_path == model_args.model_name_or_path: + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_tokenized_datasets.pt") + + saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + logger.info("Sanity check: loaded tokenized_datasets") + else: + raw_datasets = load_data(data_args, model_args) + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + + text_column_name = "text" if "text" in column_names else column_names[0] + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 
+ ) + return output + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + logger.info("Sanity check: saved tokenized_datasets") + if "train" not in tokenized_datasets and training_args.do_train: + raise ValueError("--do_train requires a train dataset") + if "validation" not in tokenized_datasets and training_args.do_eval: + raise ValueError("--do_eval requires a validation dataset") + return tokenized_datasets + + +def get_lm_dataset(training_args, data_args, model_args, tokenizer): + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_datasets.pt") + if not tokenizer.name_or_path == model_args.model_name_or_path: + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_lm_datasets.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + logger.info("Sanity check: loaded lm_datasets") + else: + + tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + logger.info("Sanity check: saved lm_datasets") + return lm_datasets + +def add_adapters(adapter_args, data_args, model): + # Setup adapters + if adapter_args.train_adapter: + task_name = data_args.dataset_name or "clm" + task_name += f"_{adapter_args.language}" + # check if adapter already exists, otherwise add it + if task_name not in model.config.adapters: + # resolve the adapter config + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + ) + # load a pre-trained from Hub if specified + if adapter_args.load_adapter: + model.load_adapter( + adapter_args.load_adapter, + config=adapter_config, + load_as=task_name, + ) + # otherwise, add a fresh adapter + else: + model.add_adapter(task_name, config=adapter_config) + # optionally load a pre-trained language adapter + if adapter_args.load_lang_adapter: + # resolve the language adapter config + lang_adapter_config = AdapterConfig.load( + adapter_args.lang_adapter_config, + non_linearity=adapter_args.lang_adapter_non_linearity, + reduction_factor=adapter_args.lang_adapter_reduction_factor, + ) + # load the language adapter from Hub + lang_adapter_name = model.load_adapter( + adapter_args.load_lang_adapter, + config=lang_adapter_config, + load_as=adapter_args.language, + ) + else: + lang_adapter_name = None + # Freeze all model weights except of those of this adapter + model.train_adapter([task_name]) + # Set the adapters to be used in every forward pass + if lang_adapter_name: + model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + else: + model.set_active_adapters(task_name) + else: + if adapter_args.load_adapter or adapter_args.load_lang_adapter: + raise ValueError( + "Adapters can only be loaded in adapters training mode." 
+ "Use --train_adapter to enable adapter training" + ) + trainable_params = 0 + frozen_params = 0 + emb_params = 0 + for name, param in model.named_parameters(): + if not param.requires_grad: + if not "wte" in name and not "lm_head" in name: + print(f"🥶 Frozen layer '{name}'") + frozen_params +=param.numel() + else: + param.requires_grad = True + print(f"🚀 Trainable layer '{name}'") + emb_params += param.numel() + else: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + print(f"Total frozen parameters: {frozen_params}") + print(f"Total emb parameters: {emb_params}") + print(f"Total trainable parameters: {trainable_params}") + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, adapter_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() + training_args.data_dir = f'{training_args.output_dir}/../' + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"model_args {model_args}") + logger.info(f"data_args {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + logger.info(f"Adapter parameters {adapter_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + pass + #raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." + #) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + tokenizer = load_tokenizer(model_args) + model = load_model(model_args, tokenizer) + + add_adapters(adapter_args, data_args, model) + # Preprocessing the datasets. 
+ lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) + if training_args.do_train: + train_dataset = lm_datasets["train"] + + if training_args.do_eval: + + eval_dataset = lm_datasets["validation"] + + + # Initialize our Trainer + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. + data_collator=default_data_collator, + ) + + logger.info(model) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + +# if training_args.push_to_hub: +# trainer.push_to_hub(**kwargs) +# else: +# trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/scripts/archive/madx_exp/madxlastlayer_lngembft_clm.py b/scripts/archive/madx_exp/madxlastlayer_lngembft_clm.py new file mode 100644 index 0000000..7234cea --- /dev/null +++ b/scripts/archive/madx_exp/madxlastlayer_lngembft_clm.py @@ -0,0 +1,618 @@ +""" +Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import torch +import pathlib + +import datasets +from datasets import load_dataset + +import transformers +import transformers.adapters.composition as ac +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AdapterTrainer, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + MultiLingAdapterArguments, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.adapters.configuration import AdapterConfig 
+from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.11.0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + block_size: Optional[int] = field( + default=None, + metadata={ + "help": "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def load_tokenizer(model_args): + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
+ ) + return tokenizer + + + +def load_data(data_args, model_args): + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) + + if "validation" not in raw_datasets.keys(): + if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: + raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) + elif data_args.max_eval_samples is not None : + raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) + else: + raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) + + raw_datasets['validation'] = raw_datasets['test'] + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ + return raw_datasets + +def load_model(model_args, tokenizer): + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForCausalLM.from_config(config) + n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + #TODO: remap embedding parameters + #if not tokenizer.name_or_path == model_args.model_name_or_path: + # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + + model.resize_token_embeddings(len(tokenizer)) + return model + +def preprocess_data(training_args, data_args, model_args, tokenizer): + with training_args.main_process_first(desc="dataset map tokenization"): + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_datasets.pt") + if not tokenizer.name_or_path == model_args.model_name_or_path: + saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_tokenized_datasets.pt") + + saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) + logger.info("Sanity check: loaded tokenized_datasets") + else: + raw_datasets = load_data(data_args, model_args) + # First we tokenize all the texts. + if training_args.do_train: + column_names = raw_datasets["train"].column_names + else: + column_names = raw_datasets["validation"].column_names + + text_column_name = "text" if "text" in column_names else column_names[0] + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 
+ ) + return output + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + torch.save(tokenized_datasets, saved_tokenized_datasets_fp) + logger.info("Sanity check: saved tokenized_datasets") + if "train" not in tokenized_datasets and training_args.do_train: + raise ValueError("--do_train requires a train dataset") + if "validation" not in tokenized_datasets and training_args.do_eval: + raise ValueError("--do_eval requires a validation dataset") + return tokenized_datasets + + +def get_lm_dataset(training_args, data_args, model_args, tokenizer): + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with training_args.main_process_first(desc="grouping texts together"): + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_datasets.pt") + if not tokenizer.name_or_path == model_args.model_name_or_path: + saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_lm_datasets.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + lm_datasets = torch.load(str(saved_lm_datasets_fp)) + logger.info("Sanity check: loaded lm_datasets") + else: + + tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + torch.save(lm_datasets, saved_lm_datasets_fp) + logger.info("Sanity check: saved lm_datasets") + return lm_datasets + +def add_adapters(adapter_args, data_args, model): + # Setup adapters + if adapter_args.train_adapter: + task_name = data_args.dataset_name or "clm" + task_name += f"_{adapter_args.language}" + # check if adapter already exists, otherwise add it + if task_name not in model.config.adapters: + # resolve the adapter config + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + leave_out = [i for i in range(0,23)] + ) + # load a pre-trained from Hub if specified + if adapter_args.load_adapter: + model.load_adapter( + adapter_args.load_adapter, + config=adapter_config, + load_as=task_name, + ) + # otherwise, add a fresh adapter + else: + model.add_adapter(task_name, config=adapter_config) + # optionally load a pre-trained language adapter + if adapter_args.load_lang_adapter: + # resolve the language adapter config + lang_adapter_config = AdapterConfig.load( + adapter_args.lang_adapter_config, + non_linearity=adapter_args.lang_adapter_non_linearity, + reduction_factor=adapter_args.lang_adapter_reduction_factor, + ) + # load the language adapter from Hub + lang_adapter_name = model.load_adapter( + adapter_args.load_lang_adapter, + config=lang_adapter_config, + load_as=adapter_args.language, + ) + else: + lang_adapter_name = None + # Freeze all model weights except of those of this adapter + model.train_adapter([task_name]) + # Set the adapters to be used in every forward pass + if lang_adapter_name: + model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) + else: + model.set_active_adapters(task_name) + else: + if adapter_args.load_adapter or adapter_args.load_lang_adapter: + raise ValueError( + "Adapters can only be loaded in adapters training mode." 
+ "Use --train_adapter to enable adapter training" + ) + trainable_params = 0 + frozen_params = 0 + emb_params = 0 + for name, param in model.named_parameters(): + if not param.requires_grad: + if not "wte" in name and not "lm_head" in name: + print(f"🥶 Frozen layer '{name}'") + frozen_params +=param.numel() + else: + param.requires_grad = True + print(f"🚀 Trainable layer '{name}'") + emb_params += param.numel() + else: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + print(f"Total frozen parameters: {frozen_params}") + print(f"Total emb parameters: {emb_params}") + print(f"Total trainable parameters: {trainable_params}") + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, adapter_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() + training_args.data_dir = f'{training_args.output_dir}/../' + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"model_args {model_args}") + logger.info(f"data_args {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + logger.info(f"Adapter parameters {adapter_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + pass + #raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." + #) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + tokenizer = load_tokenizer(model_args) + model = load_model(model_args, tokenizer) + + add_adapters(adapter_args, data_args, model) + # Preprocessing the datasets. 
+ lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) + if training_args.do_train: + train_dataset = lm_datasets["train"] + + if training_args.do_eval: + + eval_dataset = lm_datasets["validation"] + + + # Initialize our Trainer + trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer = trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. + data_collator=default_data_collator, + ) + + logger.info(model) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + +# if training_args.push_to_hub: +# trainer.push_to_hub(**kwargs) +# else: +# trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/scripts/archive/madx_exp/run_clm_madx_lngemb.sh b/scripts/archive/madx_exp/run_clm_madx_lngemb.sh new file mode 100644 index 0000000..4e1315b --- /dev/null +++ b/scripts/archive/madx_exp/run_clm_madx_lngemb.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH -p gpu +#SBATCH --gres="gpu:1" + +# Default resources are 1 core with 2.8GB of memory. 
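+# Request 16 tasks, presumably to match the --preprocessing_num_workers 16 passed to the
+# training script below.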
+#SBATCH --ntasks=16 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J exp-009-run_clm_de_madx + +# Specify an output file +#SBATCH -o /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_de_madx-%j.out +#SBATCH -e /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_de_madx-%j.err + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com + +# Set up the environment by loading modules +source /tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/env/bin/activate +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience/ + +data_sample=100000 +ch=$1 +lng=$2 +dataset=oscar +adapter_config="pfeiffer+inv" +adapter_reduction_factor=48 +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +tokenizer_dir="${FP_BIGS}/tokenizers/bigscience-1.3B-${lng}-tokenizer" +cache_dir="${FP_BIGS}/data/${dataset}_${lng}" +data_dir="${FP_BIGS}/exp-009/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}" +data_tok_dir="${FP_BIGS}/exp-009/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}/lng_tok" +output_dir="${data_dir}/withlngembft-lmhead-${adapter_config}-${adapter_reduction_factor}" +logging_dir="${FP_BIGS}/logs/exp-009/madx-bs1b3-multi-ch${ch}-${dataset}-${lng}-sample${data_sample}-withlngembft-lmhead-${adapter_config}-${adapter_reduction_factor}" + + +python $FP_BIGS/multilingual-modeling/scripts/madx_exp/madx_lngembft_clm.py \ + --fp16 \ + --model_name_or_path ${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name \ + --tokenizer_name ${tokenizer_dir} \ + --dataset_name ${dataset} \ + --cache_dir $cache_dir \ + --dataset_config_name unshuffled_deduplicated_${lng} \ + --logging_dir ${logging_dir} \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir ${output_dir} \ + --preprocessing_num_workers 16 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 2 \ + --eval_steps 5000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --train_adapter \ + --adapter_reduction_factor ${adapter_reduction_factor} \ + --language ${lng} \ + --num_train_epochs 6.0 \ + --adapter_config ${adapter_config} \ + --max_train_samples ${data_sample} diff --git a/scripts/archive/xnli/README.md b/scripts/archive/xnli/README.md new file mode 100644 index 0000000..f368439 --- /dev/null +++ b/scripts/archive/xnli/README.md @@ -0,0 +1,80 @@ +# XNLI Evaluation + +Use `xnli_v2.py` to run the evaluation on XNLI. + +### With Language Adapters +``` +LANG="th" +CACHE_DIR="/users/zyong2/data/zyong2/huggingface/" +lr=5e-5 + +# Original BigScience model and language-specific tokenizer +MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-ckpt118500" +TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_24000" + +# saved language adapters +MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/oscar_th" + +# saved embedding layers +WTE="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/transformer.wte.weight.pt" +WPE="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/transformer.wpe.weight.pt" + +# output directory +OUTPUT_DIR="$FP_BIGS/data/processed/021/xnli_th_adpt_100000samples" + +mkdir -p $OUTPUT_DIR + +# remove --zero_shot for supervised finetuning setting; otherwise, it will be cross-lingual finetuning setting. 
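+# (with --zero_shot the script trains/validates on English XNLI and only evaluates on the
+# target-language test split; without it, the target-language train/dev/test sets are used)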
+# use --use_partial_data to test the code + +python xnli_v2.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $lr \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_NAME \ +--tokenizer $TOKENIZER_NAME \ +--do_train \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--wte $WTE \ +--wpe $WPE \ +--zero_shot +``` + +### Embedding only approach (No Language Adapters) +``` +LANG="th" +CACHE_DIR="/users/zyong2/data/zyong2/huggingface/" +lr=5e-5 + +# Saved finetuned model and language-specific tokenizer +MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_emb_100000samples" +TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_24000" + +# output directory +OUTPUT_DIR="$FP_BIGS/data/processed/021/xnli_th_adpt_100000samples" + +mkdir -p $OUTPUT_DIR + +# remove --zero_shot for supervised finetuning setting; otherwise, it will be cross-lingual finetuning setting. +# use --use_partial_data to test the code + +python xnli_v2.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $lr \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_NAME \ +--tokenizer $TOKENIZER_NAME \ +--do_train \ +--do_eval_after_train \ +--zero_shot \ +--use_partial_data +``` diff --git a/scripts/archive/xnli/archive_xnli.py b/scripts/archive/xnli/archive_xnli.py new file mode 100644 index 0000000..24aed27 --- /dev/null +++ b/scripts/archive/xnli/archive_xnli.py @@ -0,0 +1,222 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") + +parser.add_argument("--madx_lang_adapter") +parser.add_argument("--adapter_lang_name", required=True) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigsciece model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.zero_shot: 
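+    # Cross-lingual setup: train/validation come from English XNLI, the test set from
+    # args.lang (which therefore must not be "en").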
+ print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args, inference=False): + # for adapters, when we load with GPT2ForSequenceClassification, the embeddings are the original model + if args.zero_shot and not inference: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) + else: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + # this part is to replace the embedding layer + if args.madx_lang_adapter and (not args.zero_shot or (args.zero_shot and 
inference)): + # if not zero shot, that means that we need to replace the embedding layers during training + # we also need to replace embedding layers during inference + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + + # change the embedding layer of the original big science model + # by loading the adapters (which has saved lm_head) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + + model.resize_token_embeddings(len(tokenizer)) + model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] + + if not inference: + if not args.zero_shot and args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv", + load_as=args.adapter_lang_name) + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + + print("🔥 ==================== Training: ==================== 🔥") + print(model) + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + else: + print("🔥 ==================== Inference: ==================== 🔥") + if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + adapter_name = model.load_adapter(args.madx_lang_adapter) + model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + else: + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + + # for TGT -> TGT supervised finetuning setting, change adapter_name + adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + model.set_active_adapters(adapter_name) + print(model) + + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/archive/xnli/xnli_v2.py b/scripts/archive/xnli/xnli_v2.py new file mode 100644 index 0000000..1887e83 --- /dev/null +++ b/scripts/archive/xnli/xnli_v2.py @@ -0,0 +1,213 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch 
+import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--pretrained_model") +parser.add_argument("--original_model") +parser.add_argument("--wte") +parser.add_argument("--wpe") +parser.add_argument("--tokenizer") +parser.add_argument("--madx_lang_adapter") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--zero_shot", default=False, action="store_true") + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, pretrained_model is the original bigsciece model + args.original_model = args.pretrained_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.zero_shot: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.zero_shot: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.zero_shot: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) +else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + +full_test_dataset = 
test_dataset.map(tokenize_function, batched=False) +small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) +small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args, inference=False): + # for adapters, when we load with GPT2ForSequenceClassification, the embeddings are the original model + if args.zero_shot and not inference: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) + else: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, + num_labels=3, + pad_token_id=tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + # this part is to replace the embedding layer + if not args.zero_shot or (args.zero_shot and inference): + if args.wpe: + wpe = torch.load(args.wpe) + model._modules['transformer']._modules['wpe'].weight.data = wpe + logger.info(f"Loaded wpe from {args.wpe}") + if args.wte: + wte = torch.load(args.wte) + model._modules['transformer']._modules['wte'].weight.data = wte + logger.info(f"Loaded wte from {args.wte}") + + if not inference: + if not args.zero_shot and args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter, + config="pfeiffer+inv") + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + + print("🔥 ==================== Training: ==================== 🔥") + print(model) + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + else: + print("🔥 ==================== Inference: ==================== 🔥") + assert args.pretrained_adapters_dir + if args.madx_lang_adapter: + adapter_name = model.load_adapter(args.madx_lang_adapter) + model.set_active_adapters(adapter_name) + + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + print(model) + + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + 
args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained task adapters from {args.pretrained_adapters_dir}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, + compute_metrics=compute_metrics + ) + + result = trainer.evaluate() + + print("Evaluate on Test:", result) \ No newline at end of file diff --git a/scripts/eval/README.md b/scripts/eval/README.md new file mode 100644 index 0000000..f7c1195 --- /dev/null +++ b/scripts/eval/README.md @@ -0,0 +1,45 @@ +# XNLI (Cross-Lingual and Supervised Setting) + +Current scripts are for XNLI (German). + +``` +OUTPUT_DIR=... # where you want to save checkpoints at +LANG="de" +CACHE_DIR=... # cache dir for saving/loading HF models and XNLI datasets. +LR=1e-5 +MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" +TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" + +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME=".../oscar_de" + +# we finetune task adapters for XNLI +FT_STRATEGIES="task_adapters" + +mkdir -p $OUTPUT_DIR +python adapters_xnli_de.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_NAME \ +--tokenizer $TOKENIZER_NAME \ +--do_train \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--finetune_strategies $FT_STRATEGIES \ +--zero_shot +``` + +Remove `--zero_shot` for supervised finetuning setting. + +Notes: +- `adapters_xnli_de_vn.py` is Vassilina's forked of `adapters_xnli_de.py`. +- `train_xnli_zero_shot.sh` is the batch script for XNLI training, and `run_eval_xnli_zero_shot.sh` is for evaluating trained XNLI task adapters. + +### Zero-shot Prompt-based Setting + +See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). 
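+
+### Input Format
+
+For reference, the XNLI scripts in this folder feed `GPT2ForSequenceClassification` one sequence per example: the premise and hypothesis joined by the tokenizer's `eos_token`, padded/truncated to 128 tokens. A minimal sketch of that preprocessing (mirroring `tokenize_function` in `adapters_xnli_de.py`; the tokenizer path is a placeholder):
+
+```
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("path/to/lang_specific_tokenizer")  # placeholder path
+tokenizer.pad_token = tokenizer.eos_token
+
+def tokenize_function(example):
+    # "premise <eos> hypothesis", padded/truncated to 128 tokens
+    return tokenizer(
+        f'{example["premise"]} {tokenizer.eos_token} {example["hypothesis"]}',
+        max_length=128,
+        padding="max_length",
+        truncation=True,
+    )
+```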
\ No newline at end of file diff --git a/scripts/eval/adapters_xnli_de.py b/scripts/eval/adapters_xnli_de.py new file mode 100644 index 0000000..3e29ddd --- /dev/null +++ b/scripts/eval/adapters_xnli_de.py @@ -0,0 +1,229 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--adapted_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--cross_lingual", default=False, action="store_true") + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter") +#parser.add_argument("--adapter_lang_name", required=True) -- why is this required?? 
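+# NOTE: --finetune_strategies (added below) is required and validated against the choices
+# above, but load_model() in this script currently follows the task-adapter path whichever
+# value is passed.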
+parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, adapted_model is the original bigsciece model + args.original_model = args.adapted_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.cross_lingual: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.cross_lingual: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.do_train: + if args.cross_lingual: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) + else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + + + small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) + small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) + logger.info(full_train_dataset[0]) + logger.info(full_train_dataset[100]) + +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args, inference=False): + # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are 
the original one + # even when we call load_adapter + if not args.original_model == args.adapted_model and not args.cross_lingual: + wte = torch.load(f'{args.adapted_model}/embedding.pt') + wpe = torch.load(f'{args.adapted_model}/positional_embedding.pt') + + model = GPT2ForSequenceClassification.from_pretrained(args.original_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + if inference or not args.cross_lingual: + # need to load embedding/adapters from the model adapted to the new language + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + if not args.original_model == args.adapted_model: + causal_lm_model.transformer.wte = wte + causal_lm_model.transformer.wpe = wpe + if args.madx_lang_adapter: + adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + model.transformer = causal_lm_model.transformer + model.set_active_adapters(adapter_name) + + if not inference: + #if not args.cross_lingual: normally need to add adapter in any case + # normally this is already done, why use adapter_lang_name here? + #if args.madx_lang_adapter: + # adapter_name = model.load_adapter(args.madx_lang_adapter, + # config="pfeiffer+inv", + # load_as=args.adapter_lang_name) + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + + + print("🔥 ==================== Training: ==================== 🔥") + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + print(model) + else: + #if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + # normally this is done in any case + #adapter_name = model.load_adapter(args.madx_lang_adapter) + #model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + #else: + # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + # # not sure what happens here + # # for TGT -> TGT supervised finetuning setting, change adapter_name + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + # model.set_active_adapters(adapter_name) + print(model) + + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else 
full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) diff --git a/scripts/eval/adapters_xnli_de_vn.py b/scripts/eval/adapters_xnli_de_vn.py new file mode 100644 index 0000000..3e29ddd --- /dev/null +++ b/scripts/eval/adapters_xnli_de_vn.py @@ -0,0 +1,229 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, AdapterTrainer +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="de") +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) +parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--adapted_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--cross_lingual", default=False, action="store_true") + +finetune_strategies = ["whole", "lang_adapters", "task_adapters"] +parser.add_argument("--madx_lang_adapter") +#parser.add_argument("--adapter_lang_name", required=True) -- why is this required?? 
+parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) + +args = parser.parse_args() +if args.do_eval_after_train: + args.do_predict = True + +if args.original_model is None: + # here: because the wpe is not saved, adapted_model is the original bigsciece model + args.original_model = args.adapted_model + +print("Arguments: ========") +print(args) + + +# load dataset +if args.cross_lingual: + print("0️⃣ 0-Shot") + # 0-shot: use english as train and validation + xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + assert args.lang != "en" + train_dataset = xnli_en_dataset['train'] + val_dataset = xnli_en_dataset['validation'] + test_dataset = xnli_dataset['test'] +else: + print("👀 Supervised Training") + xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) + + train_dataset = xnli_dataset['train'] + val_dataset = xnli_dataset['validation'] + test_dataset = xnli_dataset['test'] + + +# load tokenizer +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) +tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] +if args.cross_lingual: + en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer + en_tokenizer.pad_token = en_tokenizer.eos_token + +def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + +logger.info("Tokenizing the dataset...") +if args.do_train: + if args.cross_lingual: + full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) + full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) + else: + full_train_dataset = train_dataset.map(tokenize_function, batched=False) + full_val_dataset = val_dataset.map(tokenize_function, batched=False) + + + small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) + small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) + logger.info(full_train_dataset[0]) + logger.info(full_train_dataset[100]) + +full_test_dataset = test_dataset.map(tokenize_function, batched=False) +small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) + +from datasets import load_metric +metric = load_metric("xnli") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + +training_args = TrainingArguments( + args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else 10, + num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.per_device_train_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, +) + +def load_model(args, inference=False): + # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are 
the original one + # even when we call load_adapter + if not args.original_model == args.adapted_model and not args.cross_lingual: + wte = torch.load(f'{args.adapted_model}/embedding.pt') + wpe = torch.load(f'{args.adapted_model}/positional_embedding.pt') + + model = GPT2ForSequenceClassification.from_pretrained(args.original_model, + num_labels=3, + pad_token_id=en_tokenizer.pad_token_id, + cache_dir=args.cache_dir) + + if inference or not args.cross_lingual: + # need to load embedding/adapters from the model adapted to the new language + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + if not args.original_model == args.adapted_model: + causal_lm_model.transformer.wte = wte + causal_lm_model.transformer.wpe = wpe + if args.madx_lang_adapter: + adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + model.transformer = causal_lm_model.transformer + model.set_active_adapters(adapter_name) + + if not inference: + #if not args.cross_lingual: normally need to add adapter in any case + # normally this is already done, why use adapter_lang_name here? + #if args.madx_lang_adapter: + # adapter_name = model.load_adapter(args.madx_lang_adapter, + # config="pfeiffer+inv", + # load_as=args.adapter_lang_name) + model.add_adapter("xnli-task-adapter") + model.train_adapter("xnli-task-adapter") + + + print("🔥 ==================== Training: ==================== 🔥") + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + print(model) + else: + #if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + # normally this is done in any case + #adapter_name = model.load_adapter(args.madx_lang_adapter) + #model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") + model.set_active_adapters(adapter_name) + #else: + # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + # # not sure what happens here + # # for TGT -> TGT supervised finetuning setting, change adapter_name + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + # model.set_active_adapters(adapter_name) + print(model) + + return model + +if args.do_train: + logger.info("Start Training") + model = load_model(args) + trainer = AdapterTrainer( + model=model, + args=training_args, + train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, + eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, + compute_metrics=compute_metrics + ) + + trainer.train() + +if args.do_predict: + if args.do_eval_after_train: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if checkpoint_dir.startswith('checkpoint-') + ], key=lambda x: int(x[len('checkpoint-'):]))) + if args.madx_lang_adapter: + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + trainer = AdapterTrainer( + model=model, + args=training_args, + eval_dataset=small_test_dataset if args.use_partial_data else 
full_test_dataset, + compute_metrics=compute_metrics + ) + + print("Evaluate on Test:", trainer.evaluate()) diff --git a/scripts/eval/run_eval_xnli_zero_shot.sh b/scripts/eval/run_eval_xnli_zero_shot.sh new file mode 100644 index 0000000..855cde9 --- /dev/null +++ b/scripts/eval/run_eval_xnli_zero_shot.sh @@ -0,0 +1,67 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gres="gpu:1" +#SBATCH --mem=100g + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com +#SBATCH --constraint="gpu_v100&gpu_32g" + +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience +# Set up the environment by loading modules +source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate + +# XNLI (Cross-Lingual and Supervised Setting) + +LANG=$1 +data_sample=$2 +vocabsize=$3 +adapter_reduction_factor=$4 + +ch=118500 + + +adapter_config="pfeiffer+inv" +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +ORIGINAL_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name +TOKENIZER_DIR="${FP_BIGS}/tokenizers/${LANG}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k +CACHE_DIR="${FP_BIGS}/data/" +data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${data_sample}-$( basename $TOKENIZER_DIR )" +data_tok_dir=${data_dir}/lng_tok + +MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +XNLI_ZH_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full # output directory +LR=1e-5 + +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME="$MODEL_DIR/oscar_${LANG}" + +# we finetune task adapters for XNLI +FT_STRATEGIES="task_adapters" + +outdir=$MODEL_DIR/xnli_eval_zero_shot +# evaluate zero-shot training +python adapters_xnli_de_vn.py \ +$XNLI_ZH_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_DIR \ +--original_model $ORIGINAL_MODEL \ +--tokenizer $TOKENIZER_DIR \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--finetune_strategies "task_adapters" \ +--zero_shot &> $XNLI_ZH_DIR/$( basename $data_dir )-$( basename $MODEL_DIR )_eval.log + + + + +#Remove `--zero_shot` for supervised finetuning setting. + +### Zero-shot Prompt-based Setting + +#See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). 
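+
+# Usage sketch (illustrative values, not necessarily the experimental settings):
+#   sbatch run_eval_xnli_zero_shot.sh de 100000 24000 16
+# i.e. <lang> <data_sample> <vocabsize> <adapter_reduction_factor>, matching the four
+# positional arguments read at the top of this script.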
diff --git a/scripts/eval/train_xnli_zero_shot.sh b/scripts/eval/train_xnli_zero_shot.sh new file mode 100644 index 0000000..8a9445c --- /dev/null +++ b/scripts/eval/train_xnli_zero_shot.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH -p gpu +#SBATCH --gres="gpu:1" + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=100g + +# Specify a job name: +#SBATCH -J run_clm_madx + +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com +#SBATCH --constraint="gpu_v100&gpu_32g" + +# XNLI (Cross-Lingual and Supervised Setting) + +FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience +# Set up the environment by loading modules +source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate + +LANG=$1 +data_sample=$2 +vocabsize=$3 +adapter_reduction_factor=$4 + +ch=118500 + + +adapter_config="pfeiffer+inv" +model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" +ORIGINAL_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name +TOKENIZER_DIR="${FP_BIGS}/tokenizers/${LANG}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k +CACHE_DIR="${FP_BIGS}/data/" +data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${data_sample}-$( basename $TOKENIZER_DIR )" +data_tok_dir=${data_dir}/lng_tok + +MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" +OUTPUT_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full +LR=1e-5 + +# language adapters checkpoint folder +MADX_LANG_ADAPTER_NAME="$MODEL_DIR/oscar_de" + +# we finetune task adapters for XNLI +FT_STRATEGIES="task_adapters" + +mkdir -p $OUTPUT_DIR +python adapters_xnli_de_vn.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--pretrained_model $MODEL_DIR \ +--original_model $ORIGINAL_MODEL \ +--tokenizer $TOKENIZER_DIR \ +--do_train \ +--do_eval_after_train \ +--madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ +--finetune_strategies "task_adapters" \ +--zero_shot &> $OUTPUT_DIR/train.log + diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index f2ed6c2..35efb6a 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -3,6 +3,7 @@ ### Tokenizer and Tokenization of Dataset Run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. - `lang`: language name (e.g., "de", "th") +- `model`: model that uses this tokenizer (e.g., "gpt2", "bigscience/bloom-1b3`) - `tokenizer_dir`: path directory to save the tokenizer. The tokenizer will be saved as `{lang}_oscar_tokenizer_{vocab_size}` - `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer. - `vocab_size`: vocab size of the tokenizer @@ -15,6 +16,8 @@ Run `tokenized4clm_sampled.py` to train the tokenizer on the subset of OSCAR dat - `vocab_size`: vocab size of the tokenizer - `sample_size`: the amount of samples to use to train the tokenizer (randomly selected) +--- + ### Language Adaptation (6 Combinations) - use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. 
Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`. - use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`. diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 5e030ac..ba65010 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -195,11 +195,12 @@ def load_tokenizer(model_args): "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } - if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + print(f"✅ load tokenizer from: {model_args.tokenizer_name}") elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + print(f"✅ load tokenizer from: {model_args.model_name_or_path}") else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." @@ -244,9 +245,9 @@ def load_data(data_args, model_args): if "validation" not in raw_datasets.keys(): if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: - raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples*2, test_size = data_args.max_eval_samples*2) + raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) elif data_args.max_eval_samples is not None : - raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples*2) + raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) else: raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) raw_datasets['validation'] = raw_datasets['test'] @@ -261,11 +262,15 @@ def load_data(data_args, model_args): if data_args.max_train_samples is not None and len(raw_datasets['train']) > data_args.max_train_samples: # FIXME: currently assume the loaded checkpoint is trained with the first data_args.max_train_samples number of samples #raw_datasets["train"] = raw_datasets["train"].filter(lambda example, indice: indice < data_args.max_train_samples, with_indices=True) + print(raw_datasets["train"]) raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) + print(raw_datasets["train"]) if data_args.max_eval_samples is not None and len(raw_datasets['validation']) > data_args.max_eval_samples: raw_datasets["validation"] = raw_datasets["validation"].select(range(data_args.max_eval_samples)) + print("✅ Loaded Raw Dataset:") + print(raw_datasets) return raw_datasets def load_model(model_args, tokenizer): @@ -293,10 +298,13 @@ def load_model(model_args, tokenizer): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) + print(f"✅ load model from: {model_args.model_name_or_path}") else: model = AutoModelForCausalLM.from_config(config) n_params = 
sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + print(f"✅ load model from config: ") + print(config) #TODO: remap embedding parameters @@ -311,7 +319,7 @@ def preprocess_data(training_args, data_args, model_args, tokenizer): else: raw_datasets = load_data(data_args, model_args) assert len(raw_datasets['train']) == data_args.max_train_samples - logger.info(f"🧠 Sanity check: loaded raw datasets have {data_args.max_train_samples} samples") + print(f"✅ Sanity check: loaded raw datasets have {data_args.max_train_samples} samples") # First we tokenize all the texts. if training_args.do_train: @@ -470,6 +478,7 @@ def modify_model(adapter_args, data_args, model_args, tokenizer, model): "Use --train_adapter to enable adapter training" ) + print(f"✅ Use Embedding Strategy: {model_args.embedding_strategies}") if model_args.embedding_strategies == "overlap-replace": if not tokenizer.name_or_path == model_args.model_name_or_path: orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) @@ -478,7 +487,13 @@ def modify_model(adapter_args, data_args, model_args, tokenizer, model): model.delete_embeddings('default') model.tie_weights() elif model_args.embedding_strategies == "replace": + # print(model.transformer.wte.weight) + # print(model.transformer.wpe.weight) model.resize_token_embeddings(len(tokenizer)) + model.transformer.wte.apply(model._init_weights) # reinitialize token embedding weights + # print(model.transformer.wte.weight) + # print(model.transformer.wpe.weight) + trainable_params = 0 frozen_params = 0 emb_params = 0 @@ -486,6 +501,8 @@ def modify_model(adapter_args, data_args, model_args, tokenizer, model): if "wte" in name or "wpe" in name: param.requires_grad = True emb_params += param.numel() + elif model_args.lang_adapt_strategies == "emb": + param.requires_grad = False if not param.requires_grad: print(f"🥶 Frozen layer '{name}'") @@ -516,7 +533,8 @@ def main(): ) else: model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() - training_args.data_dir = f'{training_args.output_dir}' + + training_args.data_dir = f'{training_args.output_dir}' assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt') assert model_args.embedding_strategies in ('replace', 'extend', 'overlap-replace') @@ -567,6 +585,7 @@ def main(): tokenizer = load_tokenizer(model_args) model = load_model(model_args, tokenizer) modify_model(adapter_args, data_args, model_args, tokenizer, model) + # Preprocessing the datasets. lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) if training_args.do_train: @@ -574,6 +593,7 @@ def main(): if training_args.do_eval: eval_dataset = lm_datasets["validation"] + # Initialize our Trainer trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer trainer = trainer_class( @@ -601,9 +621,10 @@ def main(): trainer.save_model() # Saves the tokenizer too for easy upload # normally this part only saves the adapters? 
(TODO: check) # save embedding and positional embedding (which is not saved by trainer) - trainer.model.save_embeddings(trainer.args.output_dir, 'lng_emb') - torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/positional_embedding.pt') - + embedding_name = "lng_emb" if model_args.embedding_strategies == "overlap-replace" else "default" + trainer.model.save_embeddings(trainer.args.output_dir, embedding_name) + torch.save(trainer.model.transformer.wte, f'{trainer.args.output_dir}/embedding_wte.pt') # for sanity check + torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/embedding_wpe.pt') metrics = train_result.metrics max_train_samples = ( diff --git a/scripts/lang_adapt/run_clm_emb.sh b/scripts/lang_adapt/run_clm_emb.sh index cb397ff..ce2f629 100644 --- a/scripts/lang_adapt/run_clm_emb.sh +++ b/scripts/lang_adapt/run_clm_emb.sh @@ -5,7 +5,7 @@ # Ask for the GPU partition and 1 GPU #SBATCH --partition=gpu-he --gres=gpu:1 -#SBATCH --array=100,200,500 +#SBATCH --array=1 # Default resources are 1 core with 2.8GB of memory. #SBATCH --ntasks=4 @@ -31,24 +31,31 @@ source $FP_BIGS/env_lang_adapter/bin/activate # axis -LANG="th" -MAX_TRAIN_SAMPLES=$(($SLURM_ARRAY_TASK_ID * 1000)) -BIGS_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" +LANG="my" +DATA_SAMPLES=$(($SLURM_ARRAY_TASK_ID * 1000)) +DATA_SAMPLES=100 +VOCAB_SIZE=5000 +CH=118500 +BIGS_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-ckpt${CH}" +ADPT_STRATEGY="emb" +EMBD_SRATEGY="replace" - -tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_full" +tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/${LANG}_oscar_100000_tokenizer_${VOCAB_SIZE}_${EMBD_SRATEGY}" cache_dir="/users/zyong2/data/zyong2/huggingface/" -output_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/${LANG}_emb_${MAX_TRAIN_SAMPLES}samples" -logging_dir="/users/zyong2/data/zyong2/bigscience/data/reports/020/${LANG}_emb_${MAX_TRAIN_SAMPLES}samples" +output_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/${LANG}_emb_${DATA_SAMPLES}samples" +logging_dir="/users/zyong2/data/zyong2/bigscience/reports/020/${LANG}_emb_${DATA_SAMPLES}samples" + mkdir -p $output_dir mkdir -p $logging_dir -python $FP_BIGS/scripts/exp-020/madx_run_clm.py \ +python /users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/scripts/lang_adapt/madx_run_clm.py \ + --seed 0 \ + --fp16 \ --model_name_or_path $BIGS_MODEL \ --tokenizer_name $tokenizer_dir \ --dataset_name oscar \ --cache_dir $cache_dir \ - --dataset_config_name "unshuffled_deduplicated_$LANG" \ + --dataset_config_name "unshuffled_deduplicated_${LANG}" \ --logging_dir $logging_dir \ --report_to "tensorboard" \ --learning_rate 0.001 \ @@ -66,7 +73,8 @@ python $FP_BIGS/scripts/exp-020/madx_run_clm.py \ --max_eval_samples 5000 \ --save_steps 25000 \ --save_strategy "steps" \ - --max_train_samples $MAX_TRAIN_SAMPLES \ + --max_train_samples $DATA_SAMPLES \ --max_steps 50000 \ - --lang_adapt_strategies "emb" \ - --embedding_strategies "replace" \ No newline at end of file + --lang_adapt_strategies $ADPT_STRATEGY \ + --embedding_strategies $EMBD_SRATEGY \ + --load_best_model_at_end \ No newline at end of file diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index 71e1fea..ccafa21 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -26,11 +26,12 @@ parser = 
argparse.ArgumentParser() parser.add_argument('--lang', type=str, required=True) +parser.add_argument('--model', type=str, required=True) parser.add_argument('--tokenizer_dir', type=str, required=True) parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) parser.add_argument('--vocab_size', default=130_000, type=int) parser.add_argument('--extend_vocab', action='store_true') -parser.add_argument('--replace_with_overlap', action='store_true') # this is not working as expected +# parser.add_argument('--replace_with_overlap', action='store_true') parser.add_argument('--sample_size', default=None, type=int) args = parser.parse_args() @@ -73,20 +74,22 @@ def batch_iterator(): print(f"Overlap with previous vocab: {args.vocab_size - added}") tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") -elif args.replace_with_overlap: - # This setting is not really working properly: we need to save the new_tokenizer, but add somehow token that can be used at inference which I don't know how to do (so that it is also get used at tokenization step properly - tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/', unk_token="") - assert tokenizer.is_fast - new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) - print("✅ Trained tokenizer with len ", len(new_tokenizer)) - new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") - print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") +# elif args.replace_with_overlap: +# # +# tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/', unk_token="") + +# assert tokenizer.is_fast +# new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) +# print("✅ Trained tokenizer with len ", len(new_tokenizer)) +# new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") +# print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") + else: - tokenizer = AutoTokenizer.from_pretrained('gpt2') + tokenizer = AutoTokenizer.from_pretrained(args.model) assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) print("Unique toks, ", len(unique_toks)) print("✅ Trained tokenizer with len ", len(new_tokenizer)) - new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_scratch") - print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_scratch") + new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_replace") + print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_replace") From f98bb12c9adb7e69a487474857510abc43c3dcbb Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 1 Jun 2022 06:00:20 -0400 Subject: [PATCH 082/142] update --- README.md 
| 16 - scripts/eval_xnli/README.md | 45 -- scripts/eval_xnli/adapters_xnli_de.py | 229 ------- scripts/eval_xnli/adapters_xnli_de_vn.py | 229 ------- scripts/eval_xnli/run_eval_xnli_zero_shot.sh | 67 -- scripts/eval_xnli/train_xnli_zero_shot.sh | 66 -- .../compute_retrieval_acc.sh | 22 - .../compute_retrieval_acc_bs.sh | 10 - .../eval_sentence_retrieval.py | 222 ------- scripts/lang_adapt/madx_run_clm.py | 5 - scripts/madx_exp/madx_lngembft_clm.py | 617 ----------------- .../madx_exp/madxlastlayer_lngembft_clm.py | 618 ------------------ scripts/madx_exp/run_clm_madx_lngemb.sh | 68 -- scripts/xnli/README.md | 80 --- scripts/xnli/archive_xnli.py | 222 ------- scripts/xnli/xnli_v2.py | 213 ------ 16 files changed, 2729 deletions(-) delete mode 100644 scripts/eval_xnli/README.md delete mode 100644 scripts/eval_xnli/adapters_xnli_de.py delete mode 100644 scripts/eval_xnli/adapters_xnli_de_vn.py delete mode 100644 scripts/eval_xnli/run_eval_xnli_zero_shot.sh delete mode 100644 scripts/eval_xnli/train_xnli_zero_shot.sh delete mode 100644 scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh delete mode 100644 scripts/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh delete mode 100644 scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py delete mode 100644 scripts/madx_exp/madx_lngembft_clm.py delete mode 100644 scripts/madx_exp/madxlastlayer_lngembft_clm.py delete mode 100644 scripts/madx_exp/run_clm_madx_lngemb.sh delete mode 100644 scripts/xnli/README.md delete mode 100644 scripts/xnli/archive_xnli.py delete mode 100644 scripts/xnli/xnli_v2.py diff --git a/README.md b/README.md index 11f32dd..e69de29 100644 --- a/README.md +++ b/README.md @@ -1,16 +0,0 @@ -### Previous Experiments -- `exp-001`: train gpt-2's tokenizer and finetune gpt-2's embedding layers `wte` and `wpe` on HF's OSCAR `unshuffled_deduplicated_fr` and `unshuffled_dudplicated_kr`. -- `exp-002`: evaluate gpt-2 on FLUE's tasks (CLS, XNLI, PAWS) -- `exp-003`: TODO: evaluate on multiatis -- `exp-004`: Does the embedding layer learn anything useful? Take a dataset in English for PAWS-X, finetune GPT-2 on this dataset, evaluate it on English test set T_e. Then, take the same test-set T_e translated in French (T_f), take GPT-2 parameters fine-tuned for the task X, replace English embeddings with French embeddings and evaluate thus obtained model on French test set. - -# Experiment folders below after Conversation with Vassilina, Hady, Iz, and Maruf [Link](https://huggingface.slack.com/archives/C020G6A9KHQ/p1637023149074800) -- `exp-005`: cleaned from `exp-001` for finetuning GPT-2 embedding layers for DE and KO on Oscar. -- `exp-006`: run zero-shot and finetuned evaluation setting for XNLI ✅, PAWS ❌, and XQuAD ❌. (❌ means not done. ✅ means done.) -- `exp-007`: apply MAD-X adapter method. [Paper link](https://arxiv.org/abs/2005.00052) -- `exp-008`: from exp-006, but using mBERT on the zero-shot and finetuning setting. - - -# Carbon Tracking -Do not forget to log your experiments [in this spreadsheet](https://docs.google.com/spreadsheets/d/1Mk8mYCOF_WxMv-Uv5ThkFs5Ak5B9s9EnRUh1CpykEJ0/edit#gid=0) - diff --git a/scripts/eval_xnli/README.md b/scripts/eval_xnli/README.md deleted file mode 100644 index f7c1195..0000000 --- a/scripts/eval_xnli/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# XNLI (Cross-Lingual and Supervised Setting) - -Current scripts are for XNLI (German). - -``` -OUTPUT_DIR=... # where you want to save checkpoints at -LANG="de" -CACHE_DIR=... # cache dir for saving/loading HF models and XNLI datasets. 
-LR=1e-5 -MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints" -TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/011/oscar-de-tokenizer" - -# language adapters checkpoint folder -MADX_LANG_ADAPTER_NAME=".../oscar_de" - -# we finetune task adapters for XNLI -FT_STRATEGIES="task_adapters" - -mkdir -p $OUTPUT_DIR -python adapters_xnli_de.py \ -$OUTPUT_DIR \ ---lang $LANG \ ---cache_dir $CACHE_DIR \ ---num_train_epochs 2 \ ---learning_rate $LR \ ---per_device_train_batch_size 8 \ ---gradient_accumulation_steps 4 \ ---pretrained_model $MODEL_NAME \ ---tokenizer $TOKENIZER_NAME \ ---do_train \ ---do_eval_after_train \ ---madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ ---finetune_strategies $FT_STRATEGIES \ ---zero_shot -``` - -Remove `--zero_shot` for supervised finetuning setting. - -Notes: -- `adapters_xnli_de_vn.py` is Vassilina's forked of `adapters_xnli_de.py`. -- `train_xnli_zero_shot.sh` is the batch script for XNLI training, and `run_eval_xnli_zero_shot.sh` is for evaluating trained XNLI task adapters. - -### Zero-shot Prompt-based Setting - -See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). \ No newline at end of file diff --git a/scripts/eval_xnli/adapters_xnli_de.py b/scripts/eval_xnli/adapters_xnli_de.py deleted file mode 100644 index 3e29ddd..0000000 --- a/scripts/eval_xnli/adapters_xnli_de.py +++ /dev/null @@ -1,229 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer, AdapterTrainer -from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--lang", type=str, default="de") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--adapted_model") -parser.add_argument("--original_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--cross_lingual", default=False, action="store_true") - -finetune_strategies = ["whole", "lang_adapters", "task_adapters"] -parser.add_argument("--madx_lang_adapter") -#parser.add_argument("--adapter_lang_name", required=True) -- why is this required?? 
-parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) - -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -if args.original_model is None: - # here: because the wpe is not saved, adapted_model is the original bigsciece model - args.original_model = args.adapted_model - -print("Arguments: ========") -print(args) - - -# load dataset -if args.cross_lingual: - print("0️⃣ 0-Shot") - # 0-shot: use english as train and validation - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - assert args.lang != "en" - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = xnli_dataset['test'] -else: - print("👀 Supervised Training") - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - - train_dataset = xnli_dataset['train'] - val_dataset = xnli_dataset['validation'] - test_dataset = xnli_dataset['test'] - - -# load tokenizer -tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.cross_lingual: - en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer - en_tokenizer.pad_token = en_tokenizer.eos_token - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -def en_tokenize_function(examples): - return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - - -logger.info("Tokenizing the dataset...") -if args.do_train: - if args.cross_lingual: - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) - else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False) - full_val_dataset = val_dataset.map(tokenize_function, batched=False) - - - small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) - small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) - logger.info(full_train_dataset[0]) - logger.info(full_train_dataset[100]) - -full_test_dataset = test_dataset.map(tokenize_function, batched=False) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -from datasets import load_metric -metric = load_metric("xnli") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - eval_steps=500 if not args.use_partial_data else 10, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=500, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -def load_model(args, inference=False): - # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are 
the original one - # even when we call load_adapter - if not args.original_model == args.adapted_model and not args.cross_lingual: - wte = torch.load(f'{args.adapted_model}/embedding.pt') - wpe = torch.load(f'{args.adapted_model}/positional_embedding.pt') - - model = GPT2ForSequenceClassification.from_pretrained(args.original_model, - num_labels=3, - pad_token_id=en_tokenizer.pad_token_id, - cache_dir=args.cache_dir) - - if inference or not args.cross_lingual: - # need to load embedding/adapters from the model adapted to the new language - causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) - causal_lm_model.resize_token_embeddings(len(tokenizer)) - if not args.original_model == args.adapted_model: - causal_lm_model.transformer.wte = wte - causal_lm_model.transformer.wpe = wpe - if args.madx_lang_adapter: - adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") - model.transformer = causal_lm_model.transformer - model.set_active_adapters(adapter_name) - - if not inference: - #if not args.cross_lingual: normally need to add adapter in any case - # normally this is already done, why use adapter_lang_name here? - #if args.madx_lang_adapter: - # adapter_name = model.load_adapter(args.madx_lang_adapter, - # config="pfeiffer+inv", - # load_as=args.adapter_lang_name) - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - - - print("🔥 ==================== Training: ==================== 🔥") - for name, param in model.named_parameters(): - if not param.requires_grad: - print(f"🥶 Frozen layer '{name}'") - else: - print(f"🚀 Trainable layer '{name}'") - print(model) - else: - #if args.madx_lang_adapter: - assert args.pretrained_adapters_dir - # normally this is done in any case - #adapter_name = model.load_adapter(args.madx_lang_adapter) - #model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") - model.set_active_adapters(adapter_name) - #else: - # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") - # # not sure what happens here - # # for TGT -> TGT supervised finetuning setting, change adapter_name - # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") - # model.set_active_adapters(adapter_name) - print(model) - - return model - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = AdapterTrainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - if args.madx_lang_adapter: - args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - model = load_model(args, inference=True) - training_args.report_to = list() - - trainer = AdapterTrainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else 
full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) diff --git a/scripts/eval_xnli/adapters_xnli_de_vn.py b/scripts/eval_xnli/adapters_xnli_de_vn.py deleted file mode 100644 index 3e29ddd..0000000 --- a/scripts/eval_xnli/adapters_xnli_de_vn.py +++ /dev/null @@ -1,229 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer, AdapterTrainer -from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--lang", type=str, default="de") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--adapted_model") -parser.add_argument("--original_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--cross_lingual", default=False, action="store_true") - -finetune_strategies = ["whole", "lang_adapters", "task_adapters"] -parser.add_argument("--madx_lang_adapter") -#parser.add_argument("--adapter_lang_name", required=True) -- why is this required?? 
-parser.add_argument("--finetune_strategies", choices=finetune_strategies, required=True) - -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -if args.original_model is None: - # here: because the wpe is not saved, adapted_model is the original bigsciece model - args.original_model = args.adapted_model - -print("Arguments: ========") -print(args) - - -# load dataset -if args.cross_lingual: - print("0️⃣ 0-Shot") - # 0-shot: use english as train and validation - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - assert args.lang != "en" - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = xnli_dataset['test'] -else: - print("👀 Supervised Training") - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - - train_dataset = xnli_dataset['train'] - val_dataset = xnli_dataset['validation'] - test_dataset = xnli_dataset['test'] - - -# load tokenizer -tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.cross_lingual: - en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer - en_tokenizer.pad_token = en_tokenizer.eos_token - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -def en_tokenize_function(examples): - return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - - -logger.info("Tokenizing the dataset...") -if args.do_train: - if args.cross_lingual: - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) - else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False) - full_val_dataset = val_dataset.map(tokenize_function, batched=False) - - - small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) - small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) - logger.info(full_train_dataset[0]) - logger.info(full_train_dataset[100]) - -full_test_dataset = test_dataset.map(tokenize_function, batched=False) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -from datasets import load_metric -metric = load_metric("xnli") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - eval_steps=500 if not args.use_partial_data else 10, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=500, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -def load_model(args, inference=False): - # FIXME: if we load with GPT2ForSequenceClassification, the embeddings are 
the original one - # even when we call load_adapter - if not args.original_model == args.adapted_model and not args.cross_lingual: - wte = torch.load(f'{args.adapted_model}/embedding.pt') - wpe = torch.load(f'{args.adapted_model}/positional_embedding.pt') - - model = GPT2ForSequenceClassification.from_pretrained(args.original_model, - num_labels=3, - pad_token_id=en_tokenizer.pad_token_id, - cache_dir=args.cache_dir) - - if inference or not args.cross_lingual: - # need to load embedding/adapters from the model adapted to the new language - causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) - causal_lm_model.resize_token_embeddings(len(tokenizer)) - if not args.original_model == args.adapted_model: - causal_lm_model.transformer.wte = wte - causal_lm_model.transformer.wpe = wpe - if args.madx_lang_adapter: - adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") - model.transformer = causal_lm_model.transformer - model.set_active_adapters(adapter_name) - - if not inference: - #if not args.cross_lingual: normally need to add adapter in any case - # normally this is already done, why use adapter_lang_name here? - #if args.madx_lang_adapter: - # adapter_name = model.load_adapter(args.madx_lang_adapter, - # config="pfeiffer+inv", - # load_as=args.adapter_lang_name) - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - - - print("🔥 ==================== Training: ==================== 🔥") - for name, param in model.named_parameters(): - if not param.requires_grad: - print(f"🥶 Frozen layer '{name}'") - else: - print(f"🚀 Trainable layer '{name}'") - print(model) - else: - #if args.madx_lang_adapter: - assert args.pretrained_adapters_dir - # normally this is done in any case - #adapter_name = model.load_adapter(args.madx_lang_adapter) - #model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") - model.set_active_adapters(adapter_name) - #else: - # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") - # # not sure what happens here - # # for TGT -> TGT supervised finetuning setting, change adapter_name - # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") - # model.set_active_adapters(adapter_name) - print(model) - - return model - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = AdapterTrainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - if args.madx_lang_adapter: - args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - model = load_model(args, inference=True) - training_args.report_to = list() - - trainer = AdapterTrainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else 
full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) diff --git a/scripts/eval_xnli/run_eval_xnli_zero_shot.sh b/scripts/eval_xnli/run_eval_xnli_zero_shot.sh deleted file mode 100644 index 855cde9..0000000 --- a/scripts/eval_xnli/run_eval_xnli_zero_shot.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gres="gpu:1" -#SBATCH --mem=100g - -#SBATCH --mail-type=BEGIN,END,FAIL -#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com -#SBATCH --constraint="gpu_v100&gpu_32g" - -FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience -# Set up the environment by loading modules -source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate - -# XNLI (Cross-Lingual and Supervised Setting) - -LANG=$1 -data_sample=$2 -vocabsize=$3 -adapter_reduction_factor=$4 - -ch=118500 - - -adapter_config="pfeiffer+inv" -model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" -ORIGINAL_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name -TOKENIZER_DIR="${FP_BIGS}/tokenizers/${LANG}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k -CACHE_DIR="${FP_BIGS}/data/" -data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${data_sample}-$( basename $TOKENIZER_DIR )" -data_tok_dir=${data_dir}/lng_tok - -MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" -XNLI_ZH_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full # output directory -LR=1e-5 - -# language adapters checkpoint folder -MADX_LANG_ADAPTER_NAME="$MODEL_DIR/oscar_${LANG}" - -# we finetune task adapters for XNLI -FT_STRATEGIES="task_adapters" - -outdir=$MODEL_DIR/xnli_eval_zero_shot -# evaluate zero-shot training -python adapters_xnli_de_vn.py \ -$XNLI_ZH_DIR \ ---lang $LANG \ ---cache_dir $CACHE_DIR \ ---num_train_epochs 2 \ ---learning_rate $LR \ ---per_device_train_batch_size 8 \ ---gradient_accumulation_steps 4 \ ---pretrained_model $MODEL_DIR \ ---original_model $ORIGINAL_MODEL \ ---tokenizer $TOKENIZER_DIR \ ---do_eval_after_train \ ---madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ ---finetune_strategies "task_adapters" \ ---zero_shot &> $XNLI_ZH_DIR/$( basename $data_dir )-$( basename $MODEL_DIR )_eval.log - - - - -#Remove `--zero_shot` for supervised finetuning setting. - -### Zero-shot Prompt-based Setting - -#See branch [`bigscience-lm-adapt`](https://github.com/yongzx/lm-evaluation-harness/tree/bigscience-lm-adapt) of yongzx/lm-evaluation-harness (forked repo). 
diff --git a/scripts/eval_xnli/train_xnli_zero_shot.sh b/scripts/eval_xnli/train_xnli_zero_shot.sh deleted file mode 100644 index 8a9445c..0000000 --- a/scripts/eval_xnli/train_xnli_zero_shot.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# Ask for the GPU partition and 1 GPU -#SBATCH -p gpu -#SBATCH --gres="gpu:1" - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J run_clm_madx - -#SBATCH --mail-type=BEGIN,END,FAIL -#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com -#SBATCH --constraint="gpu_v100&gpu_32g" - -# XNLI (Cross-Lingual and Supervised Setting) - -FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience -# Set up the environment by loading modules -source $FP_BIGS/multilingual-modeling/scripts/env/bin/activate - -LANG=$1 -data_sample=$2 -vocabsize=$3 -adapter_reduction_factor=$4 - -ch=118500 - - -adapter_config="pfeiffer+inv" -model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" -ORIGINAL_MODEL=${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name -TOKENIZER_DIR="${FP_BIGS}/tokenizers/${LANG}_oscar_${data_sample}_tokenizer_${vocabsize}" #default tok settings with vocab size = 24k -CACHE_DIR="${FP_BIGS}/data/" -data_dir="${FP_BIGS}/exp-ext-${LANG}/madx-bs1b3-multi-ch${ch}-${LANG}-sample${data_sample}-$( basename $TOKENIZER_DIR )" -data_tok_dir=${data_dir}/lng_tok - -MODEL_DIR="${data_dir}/bs1.3B${ch}-${adapter_config}-${adapter_reduction_factor}-es5" -OUTPUT_DIR=$ORIGINAL_MODEL/xnli_task_adapter_full -LR=1e-5 - -# language adapters checkpoint folder -MADX_LANG_ADAPTER_NAME="$MODEL_DIR/oscar_de" - -# we finetune task adapters for XNLI -FT_STRATEGIES="task_adapters" - -mkdir -p $OUTPUT_DIR -python adapters_xnli_de_vn.py \ -$OUTPUT_DIR \ ---lang $LANG \ ---cache_dir $CACHE_DIR \ ---num_train_epochs 2 \ ---learning_rate $LR \ ---per_device_train_batch_size 8 \ ---gradient_accumulation_steps 4 \ ---pretrained_model $MODEL_DIR \ ---original_model $ORIGINAL_MODEL \ ---tokenizer $TOKENIZER_DIR \ ---do_train \ ---do_eval_after_train \ ---madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ ---finetune_strategies "task_adapters" \ ---zero_shot &> $OUTPUT_DIR/train.log - diff --git a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh deleted file mode 100644 index a0afcd8..0000000 --- a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -#SBATCH -p gpu -#SBATCH --gres="gpu:1" -#SBATCH --ntasks=16 -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J eval_retrieval_acc - -# Specify an output file -#SBATCH -o /tmp-network/user/vnikouli/Projects/bigscience/logs/eval_retrieval_acc-%j.out -#SBATCH -e /tmp-network/user/vnikouli/Projects/bigscience/logs/eval_retrieval_acc-%j.err - -#SBATCH --mail-type=BEGIN,END,FAIL -#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com - - -model=$1 -dataset=$2 -outdir=retrieval_acc_${model}-${dataset} -mkdir $outdir -python eval_sentence_retrieval.py $outdir --pretrained_model $model --tokenizer $model --dataset $dataset diff --git a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh deleted file mode 100644 index 5c7efc2..0000000 --- a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc_bs.sh +++ /dev/null @@ -1,10 +0,0 @@ -for model in tr5b-1B3-multilingual-alpha-checkpoints; do - for ch in 12000 55500 99000 100500 117000 118500; do - mname=${model}/ch${ch} - for dataset in flores ted_multi; 
do - outdir=retrieval_acc_${model}-${dataset} - mkdir -p $outdir - sbatch compute_retrieval_acc.sh ${mname} ${dataset} - done - done -done diff --git a/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py b/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py deleted file mode 100644 index 3fdf4e3..0000000 --- a/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py +++ /dev/null @@ -1,222 +0,0 @@ -import logging -import argparse -import os -from datasets import load_dataset -from collections import namedtuple -import torch -import numpy as np -from transformers import BertTokenizer, BertModel -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM -import matplotlib -import matplotlib.pyplot as plt -import seaborn as sns -import pandas as pd -import os.path -import sys -from loguru import logger -import random -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--pretrained_model", default="bert-base-multilingual-cased") -parser.add_argument("--tokenizer", default="bert-base-multilingual-cased") -parser.add_argument("--dataset", default="ted_multi") -parser.add_argument("--device", default="cuda") -args = parser.parse_args() - -tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) -ted_lngs = ['am', 'ar', 'bn', 'ca', 'en', 'es', 'fr', 'hi', 'id', 'ja', 'pt', 'zh-cn', 'zh-tw', 'pt-br'] -flores_lng = ["amh", "bos", "cat", "eng", "spa", "fra", "hin", "ind", "jpn", "por", "swh", "vie", "urd"] -bs_languages = ["id", "eu", "vi", "zh", "ur", "es", "ca", "pt", "fr", "en", "hi", "ar", "bn"] -lngcode_map = {"am":"amh", "bn":"bos", "ca":"cat", "en":"eng", "es":"spa", "fr": "fra", "hi": "hin", "id": "ind", "ja": "jpn", "pt": "por", "ur":"urd", "vi":"vie" } - - -print("Arguments: ========") -print(args) - - -def load_dataset_(args): - if args.dataset == "ted_multi": - return load_dataset_ted(args) - if args.dataset == "flores": - return load_dataset_flores(args) - - -def load_dataset_flores_for_lng(args, lng): - dataset = load_dataset("gsarti/flores_101", lngcode_map[lng])['dev'] - return dataset - -def load_dataset_flores(args): - dataset = {} - for lng in bs_languages: - if lng in lngcode_map: - load_dataset_flores_for_lng(args, lng) - return dataset - -def load_dataset_ted(args): - dataset = load_dataset("ted_multi")['validation'] - return dataset - -def get_talks(dataset, nb_talks): - talk_names = [] - for t in dataset['talk_name']: - if len(talk_names) < nb_talks and not t in talk_names: - talk_names.append(t) - - - print([(t1, len([t for t in dataset['talk_name'] if t == t1])) for t1 in talk_names]) - return talk_names - -def load_model(args): - if "xlm" in args.pretrained_model or "bert" in args.pretrained_model: - model = AutoModelForMaskedLM.from_pretrained(args.pretrained_model) - else: - model = AutoModelForCausalLM.from_pretrained(args.pretrained_model) - model.config.output_hidden_states=True - return model.to(args.device) - -Sample = namedtuple( - "Sample", - ("id", "hidden_state") -) - -def load_from_file(fname): - return torch.load(fname) - - -def get_hidden_states(args, model): - if args.dataset == "ted_multi": - dataset = load_dataset_(args) - nb_talks = 2 - talks = get_talks(dataset, nb_talks) - - emb = get_hidden_states_for_talks(dataset, model, talks, args.pretrained_model) - - outname = 
f"{args.output_dir}/{args.pretrained_model.replace('/','-')}-talks-valid-{len(talks)}" - - elif args.dataset == "flores": - nb_samples = 200 - emb = get_hidden_states_for_flores(args, model, args.pretrained_model, nb_samples = nb_samples) - outname = f"{args.output_dir}/{args.pretrained_model.replace('/','-')}-flores-{nb_samples}" - - retrieval_acc = {} - nb_states = model.config.num_hidden_layers - fig, ax = plt.subplots(1, int(nb_states/step), figsize=(12*int(nb_states/step), 10)) - - - with open(f"{outname}.log", 'w') as fout: - for state in range(0, nb_states, step): - plot_retrieval_acc(state, emb, ax[int(state/step)], fout) - - fig.tight_layout() - plt.savefig(f'{outname}-heatmap.png') - - -def get_hidden_states_for_flores(args, model, mname, nb_samples=50): - emb = {} - hidden_state_size = model.config.num_hidden_layers - for lng in bs_languages: - if lng in lngcode_map: - fname = f"{args.output_dir}/flores-{lng}-{nb_samples}-{mname.replace('/','-')}.pt" - if os.path.isfile(fname): - emb[lng] = load_from_file(fname) - else: - dataset = load_dataset_flores_for_lng(args, lng) - emb[lng] = {} - for state in range(hidden_state_size): - emb[lng][state] = [] - for i, sid in enumerate(dataset['id'][:nb_samples]): - t = dataset['sentence'][i] - x = tokenizer(t, return_tensors="pt").input_ids.to(model.device) - out = model(x) - for state in range(hidden_state_size): - hs = torch.mean(out.hidden_states[state][0][1:-1], dim=0).detach() - emb[lng][state].append(Sample(sid, hs)) - torch.save(emb[lng], fname) - return emb - - -def get_hidden_states_for_talks(dataset, model, talks, mname): - emb = {} - hidden_state_size = model.config.num_hidden_layers - fname = f"{args.output_dir}/ted_multi-{mname.replace('/','-')}-ted_multi-{len(talks)}.pt" - if os.path.isfile(fname): - emb = load_from_file(fname) - return emb - for sid, sample in enumerate(dataset): - if sample['talk_name'] in talks: - tsample = sample['translations'] - for i, lng in enumerate(tsample['language']): - if lng in bs_languages: - t = tsample['translation'][i] - x = tokenizer(t, return_tensors="pt").input_ids.to(model.device) - if not lng in emb: - emb[lng] = {} - for state in range(hidden_state_size): - emb[lng][state] = [] - out = model(x) - for state in range(hidden_state_size): - hs = torch.mean(out.hidden_states[state][0], dim=0).detach() - emb[lng][state].append(Sample(sid, hs)) - torch.save(emb, fname) - return emb - - -def compute_sent_retrieval_acc(lng1, lng2, emb, state, out): - cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6) - E1 = torch.stack([s[1] for s in emb[lng1][state]]) - E2 = torch.stack([s[1] for s in emb[lng2][state]]) - #cos_matrix = [[cos(E2[i],E1[j]) for i in range(E2.shape[0]) ] for j in range(E1.shape[0])] - match = 0 - intersection_ids = set([emb[lng1][state][i][0] for i in range(E1.shape[0])]).intersection( - set([emb[lng2][state][i][0] for i in range(E2.shape[0])]) - ) - if len(intersection_ids)>0: - random_acc = 1/len(intersection_ids) - for i in range(E1.shape[0]): - if emb[lng1][state][i][0] in intersection_ids: - cos_sim = [cos(E2[j], E1[i]) for j in range(E2.shape[0])] - best_match = torch.argmax(torch.stack(cos_sim)) - if emb[lng2][state][best_match][0] == emb[lng1][state][i][0]: - match +=1 - acc = match/len(intersection_ids) - out.write(f"{lng1}-{lng2} = {acc} (random {random_acc} )\n") - return acc, len(intersection_ids) - else: - return 0, 0 - -def plot_retrieval_acc(state, emb, ax, out): - cmap="RdYlBu" - mean_per_state = 0 - for lng1 in emb: - if not lng1 in retrieval_acc: - 
retrieval_acc[lng1] = {} - for lng2 in emb: - lng2_chance = 1.0/len(emb[lng2][0]) - #if not lng1 == lng2: - acc, random_acc = compute_sent_retrieval_acc(lng1, lng2, emb, state, out) - retrieval_acc[lng1][lng2] = acc - #retrieval_acc[lng1]["random"] = lng2_chance - mean_acc = np.mean([v for v in retrieval_acc[lng1].values()]) - out.write(f"ACC per {lng1}, layer {state} = {mean_acc} \n" ) - mean_per_state +=mean_acc - mean_per_state = mean_per_state/len(emb.keys()) - out.write(f"ACC overall, layer {state} = {mean_per_state}\n" ) - m_res = pd.DataFrame(retrieval_acc) - m_res.columns=emb.keys() - m_res.index=emb.keys()#[e for e in emb.keys()]+["random"] - ax.set_title(f"state {state}") - sns.heatmap(m_res, ax=ax, annot=False, vmin=0, vmax=1.0, center=0, cmap=cmap) - - - -lngs2consider = ['am', 'ar', 'bn', 'ca', 'en', 'es', 'fr', 'hi', 'id', 'ja', 'pt', 'zh-cn', 'zh-tw', 'pt-br'] -samples = 10 -model = load_model(args) -retrieval_acc = {} -step=1 -get_hidden_states(args, model) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index ba65010..f6547ce 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -487,12 +487,7 @@ def modify_model(adapter_args, data_args, model_args, tokenizer, model): model.delete_embeddings('default') model.tie_weights() elif model_args.embedding_strategies == "replace": - # print(model.transformer.wte.weight) - # print(model.transformer.wpe.weight) model.resize_token_embeddings(len(tokenizer)) - model.transformer.wte.apply(model._init_weights) # reinitialize token embedding weights - # print(model.transformer.wte.weight) - # print(model.transformer.wpe.weight) trainable_params = 0 frozen_params = 0 diff --git a/scripts/madx_exp/madx_lngembft_clm.py b/scripts/madx_exp/madx_lngembft_clm.py deleted file mode 100644 index 45b7c35..0000000 --- a/scripts/madx_exp/madx_lngembft_clm.py +++ /dev/null @@ -1,617 +0,0 @@ -""" -Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py -""" - -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import torch -import pathlib - -import datasets -from datasets import load_dataset - -import transformers -import transformers.adapters.composition as ac -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AdapterTrainer, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - MultiLingAdapterArguments, - Trainer, - TrainingArguments, - default_data_collator, - set_seed, -) -from transformers.adapters.configuration import AdapterConfig -from transformers.testing_utils import CaptureLogger -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - -logger = logging.getLogger(__name__) - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 
- """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - - block_size: Optional[int] = field( - default=None, - metadata={ - "help": "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." 
- }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -def load_tokenizer(model_args): - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - return tokenizer - - - -def load_data(data_args, model_args): - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
- raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir - ) - - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) - - if "validation" not in raw_datasets.keys(): - if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: - raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) - elif data_args.max_eval_samples is not None : - raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) - else: - raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) - - raw_datasets['validation'] = raw_datasets['test'] - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - - return raw_datasets - -def load_model(model_args, tokenizer): - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - if model_args.model_name_or_path: - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - model = AutoModelForCausalLM.from_config(config) - n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - - #TODO: remap embedding parameters - #if not tokenizer.name_or_path == model_args.model_name_or_path: - # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - - model.resize_token_embeddings(len(tokenizer)) - return model - -def preprocess_data(training_args, data_args, model_args, tokenizer): - with training_args.main_process_first(desc="dataset map tokenization"): - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_datasets.pt") - if not tokenizer.name_or_path == 
model_args.model_name_or_path: - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_tokenized_datasets.pt") - - saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): - tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) - logger.info("Sanity check: loaded tokenized_datasets") - else: - raw_datasets = load_data(data_args, model_args) - # First we tokenize all the texts. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - else: - column_names = raw_datasets["validation"].column_names - - text_column_name = "text" if "text" in column_names else column_names[0] - # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - - def tokenize_function(examples): - - with CaptureLogger(tok_logger) as cl: - output = tokenizer(examples[text_column_name]) - # clm input could be much much longer than block_size - if "Token indices sequence length is longer than the" in cl.out: - tok_logger.warning( - "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." - ) - return output - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - torch.save(tokenized_datasets, saved_tokenized_datasets_fp) - logger.info("Sanity check: saved tokenized_datasets") - if "train" not in tokenized_datasets and training_args.do_train: - raise ValueError("--do_train requires a train dataset") - if "validation" not in tokenized_datasets and training_args.do_eval: - raise ValueError("--do_eval requires a validation dataset") - return tokenized_datasets - - -def get_lm_dataset(training_args, data_args, model_args, tokenizer): - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." - ) - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." - ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. 
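        # Worked example (illustrative values only):
        #   group_texts({"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8, 9]]}) with block_size = 4
        #   -> {"input_ids": [[1, 2, 3, 4], [5, 6, 7, 8]], "labels": [[1, 2, 3, 4], [5, 6, 7, 8]]}
        #   The leftover token 9 is dropped as the remainder, and "labels" is a plain copy of
        #   "input_ids" because the causal-LM loss shifts the targets by one position inside the model.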
- result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - with training_args.main_process_first(desc="grouping texts together"): - saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_datasets.pt") - if not tokenizer.name_or_path == model_args.model_name_or_path: - saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_lm_datasets.pt") - if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): - lm_datasets = torch.load(str(saved_lm_datasets_fp)) - logger.info("Sanity check: loaded lm_datasets") - else: - - tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - torch.save(lm_datasets, saved_lm_datasets_fp) - logger.info("Sanity check: saved lm_datasets") - return lm_datasets - -def add_adapters(adapter_args, data_args, model): - # Setup adapters - if adapter_args.train_adapter: - task_name = data_args.dataset_name or "clm" - task_name += f"_{adapter_args.language}" - # check if adapter already exists, otherwise add it - if task_name not in model.config.adapters: - # resolve the adapter config - adapter_config = AdapterConfig.load( - adapter_args.adapter_config, - non_linearity=adapter_args.adapter_non_linearity, - reduction_factor=adapter_args.adapter_reduction_factor, - ) - # load a pre-trained from Hub if specified - if adapter_args.load_adapter: - model.load_adapter( - adapter_args.load_adapter, - config=adapter_config, - load_as=task_name, - ) - # otherwise, add a fresh adapter - else: - model.add_adapter(task_name, config=adapter_config) - # optionally load a pre-trained language adapter - if adapter_args.load_lang_adapter: - # resolve the language adapter config - lang_adapter_config = AdapterConfig.load( - adapter_args.lang_adapter_config, - non_linearity=adapter_args.lang_adapter_non_linearity, - reduction_factor=adapter_args.lang_adapter_reduction_factor, - ) - # load the language adapter from Hub - lang_adapter_name = model.load_adapter( - adapter_args.load_lang_adapter, - config=lang_adapter_config, - load_as=adapter_args.language, - ) - else: - lang_adapter_name = None - # Freeze all model weights except of those of this adapter - model.train_adapter([task_name]) - # Set the adapters to be used in every forward pass - if lang_adapter_name: - model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) - else: - model.set_active_adapters(task_name) - else: - if adapter_args.load_adapter or adapter_args.load_lang_adapter: - raise ValueError( - "Adapters can only be loaded in adapters training mode." 
- "Use --train_adapter to enable adapter training" - ) - trainable_params = 0 - frozen_params = 0 - emb_params = 0 - for name, param in model.named_parameters(): - if not param.requires_grad: - if not "wte" in name and not "lm_head" in name: - print(f"🥶 Frozen layer '{name}'") - frozen_params +=param.numel() - else: - param.requires_grad = True - print(f"🚀 Trainable layer '{name}'") - emb_params += param.numel() - else: - print(f"🚀 Trainable layer '{name}'") - trainable_params += param.numel() - print(f"Total frozen parameters: {frozen_params}") - print(f"Total emb parameters: {emb_params}") - print(f"Total trainable parameters: {trainable_params}") - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) - - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args, adapter_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) - else: - model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() - training_args.data_dir = f'{training_args.output_dir}/../' - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"model_args {model_args}") - logger.info(f"data_args {data_args}") - logger.info(f"Training/evaluation parameters {training_args}") - logger.info(f"Adapter parameters {adapter_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - pass - #raise ValueError( - # f"Output directory ({training_args.output_dir}) already exists and is not empty. " - # "Use --overwrite_output_dir to overcome." - #) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - tokenizer = load_tokenizer(model_args) - model = load_model(model_args, tokenizer) - - add_adapters(adapter_args, data_args, model) - # Preprocessing the datasets. 
- lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) - if training_args.do_train: - train_dataset = lm_datasets["train"] - - if training_args.do_eval: - - eval_dataset = lm_datasets["validation"] - - - # Initialize our Trainer - trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer - trainer = trainer_class( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. - data_collator=default_data_collator, - ) - - logger.info(model) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - -# if training_args.push_to_hub: -# trainer.push_to_hub(**kwargs) -# else: -# trainer.create_model_card(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() diff --git a/scripts/madx_exp/madxlastlayer_lngembft_clm.py b/scripts/madx_exp/madxlastlayer_lngembft_clm.py deleted file mode 100644 index 7234cea..0000000 --- a/scripts/madx_exp/madxlastlayer_lngembft_clm.py +++ /dev/null @@ -1,618 +0,0 @@ -""" -Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py -""" - -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -import torch -import pathlib - -import datasets -from datasets import load_dataset - -import transformers -import transformers.adapters.composition as ac -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AdapterTrainer, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - MultiLingAdapterArguments, - Trainer, - TrainingArguments, - default_data_collator, - set_seed, -) -from transformers.adapters.configuration import AdapterConfig -from 
transformers.testing_utils import CaptureLogger -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.11.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - -logger = logging.getLogger(__name__) - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - - block_size: Optional[int] = field( - default=None, - metadata={ - "help": "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -def load_tokenizer(model_args): - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
- ) - return tokenizer - - - -def load_data(data_args, model_args): - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir - ) - - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args) - - if "validation" not in raw_datasets.keys(): - if data_args.max_eval_samples is not None and data_args.max_train_samples is not None: - raw_datasets = raw_datasets['train'].train_test_split(train_size = data_args.max_train_samples, test_size = data_args.max_eval_samples) - elif data_args.max_eval_samples is not None : - raw_datasets = raw_datasets['train'].train_test_split(test_size = data_args.max_eval_samples) - else: - raw_datasets = raw_datasets['train'].train_test_split(test_size = data.args.validation_split_percentage/100.0) - - raw_datasets['validation'] = raw_datasets['test'] - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- - return raw_datasets - -def load_model(model_args, tokenizer): - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - if model_args.model_name_or_path: - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - model = AutoModelForCausalLM.from_config(config) - n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - - #TODO: remap embedding parameters - #if not tokenizer.name_or_path == model_args.model_name_or_path: - # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - - model.resize_token_embeddings(len(tokenizer)) - return model - -def preprocess_data(training_args, data_args, model_args, tokenizer): - with training_args.main_process_first(desc="dataset map tokenization"): - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_datasets.pt") - if not tokenizer.name_or_path == model_args.model_name_or_path: - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_tokenized_datasets.pt") - - saved_tokenized_datasets_fp.parent.mkdir(parents=True, exist_ok=True) - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): - tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) - logger.info("Sanity check: loaded tokenized_datasets") - else: - raw_datasets = load_data(data_args, model_args) - # First we tokenize all the texts. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - else: - column_names = raw_datasets["validation"].column_names - - text_column_name = "text" if "text" in column_names else column_names[0] - # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - - def tokenize_function(examples): - - with CaptureLogger(tok_logger) as cl: - output = tokenizer(examples[text_column_name]) - # clm input could be much much longer than block_size - if "Token indices sequence length is longer than the" in cl.out: - tok_logger.warning( - "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." 
- ) - return output - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - torch.save(tokenized_datasets, saved_tokenized_datasets_fp) - logger.info("Sanity check: saved tokenized_datasets") - if "train" not in tokenized_datasets and training_args.do_train: - raise ValueError("--do_train requires a train dataset") - if "validation" not in tokenized_datasets and training_args.do_eval: - raise ValueError("--do_eval requires a validation dataset") - return tokenized_datasets - - -def get_lm_dataset(training_args, data_args, model_args, tokenizer): - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." - ) - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." - ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - with training_args.main_process_first(desc="grouping texts together"): - saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_datasets.pt") - if not tokenizer.name_or_path == model_args.model_name_or_path: - saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lngemb_lm_datasets.pt") - if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): - lm_datasets = torch.load(str(saved_lm_datasets_fp)) - logger.info("Sanity check: loaded lm_datasets") - else: - - tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - torch.save(lm_datasets, saved_lm_datasets_fp) - logger.info("Sanity check: saved lm_datasets") - return lm_datasets - -def add_adapters(adapter_args, data_args, model): - # Setup adapters - if adapter_args.train_adapter: - task_name = data_args.dataset_name or "clm" - task_name += f"_{adapter_args.language}" - # check if adapter already exists, otherwise add it - if task_name not in model.config.adapters: - # resolve the adapter config - adapter_config = AdapterConfig.load( - adapter_args.adapter_config, - non_linearity=adapter_args.adapter_non_linearity, - reduction_factor=adapter_args.adapter_reduction_factor, - leave_out = [i for i in range(0,23)] - ) - # load a pre-trained from Hub if specified - if adapter_args.load_adapter: - model.load_adapter( - adapter_args.load_adapter, - config=adapter_config, - load_as=task_name, - ) - # otherwise, add a fresh adapter - else: - model.add_adapter(task_name, config=adapter_config) - # optionally load a pre-trained language adapter - if adapter_args.load_lang_adapter: - # resolve the language adapter config - lang_adapter_config = AdapterConfig.load( - adapter_args.lang_adapter_config, - non_linearity=adapter_args.lang_adapter_non_linearity, - reduction_factor=adapter_args.lang_adapter_reduction_factor, - ) - # load the language adapter from Hub - lang_adapter_name = model.load_adapter( - adapter_args.load_lang_adapter, - config=lang_adapter_config, - load_as=adapter_args.language, - ) - else: - lang_adapter_name = None - # Freeze all model weights except of those of this adapter - model.train_adapter([task_name]) - # Set the adapters to be used in every forward pass - if lang_adapter_name: - model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) - else: - model.set_active_adapters(task_name) - else: - if adapter_args.load_adapter or adapter_args.load_lang_adapter: - raise ValueError( - "Adapters can only be loaded in adapters training mode." 
- "Use --train_adapter to enable adapter training" - ) - trainable_params = 0 - frozen_params = 0 - emb_params = 0 - for name, param in model.named_parameters(): - if not param.requires_grad: - if not "wte" in name and not "lm_head" in name: - print(f"🥶 Frozen layer '{name}'") - frozen_params +=param.numel() - else: - param.requires_grad = True - print(f"🚀 Trainable layer '{name}'") - emb_params += param.numel() - else: - print(f"🚀 Trainable layer '{name}'") - trainable_params += param.numel() - print(f"Total frozen parameters: {frozen_params}") - print(f"Total emb parameters: {emb_params}") - print(f"Total trainable parameters: {trainable_params}") - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) - - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args, adapter_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) - else: - model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() - training_args.data_dir = f'{training_args.output_dir}/../' - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"model_args {model_args}") - logger.info(f"data_args {data_args}") - logger.info(f"Training/evaluation parameters {training_args}") - logger.info(f"Adapter parameters {adapter_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - pass - #raise ValueError( - # f"Output directory ({training_args.output_dir}) already exists and is not empty. " - # "Use --overwrite_output_dir to overcome." - #) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - tokenizer = load_tokenizer(model_args) - model = load_model(model_args, tokenizer) - - add_adapters(adapter_args, data_args, model) - # Preprocessing the datasets. 
- lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) - if training_args.do_train: - train_dataset = lm_datasets["train"] - - if training_args.do_eval: - - eval_dataset = lm_datasets["validation"] - - - # Initialize our Trainer - trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer - trainer = trainer_class( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. - data_collator=default_data_collator, - ) - - logger.info(model) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - -# if training_args.push_to_hub: -# trainer.push_to_hub(**kwargs) -# else: -# trainer.create_model_card(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() diff --git a/scripts/madx_exp/run_clm_madx_lngemb.sh b/scripts/madx_exp/run_clm_madx_lngemb.sh deleted file mode 100644 index 4e1315b..0000000 --- a/scripts/madx_exp/run_clm_madx_lngemb.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash - -# Ask for the GPU partition and 1 GPU -#SBATCH -p gpu -#SBATCH --gres="gpu:1" - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=16 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=100g - -# Specify a job name: -#SBATCH -J exp-009-run_clm_de_madx - -# Specify an output file -#SBATCH -o /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_de_madx-%j.out -#SBATCH -e /tmp-network/user/vnikouli/Projects/bigscience/logs/run_clm_de_madx-%j.err - -#SBATCH --mail-type=BEGIN,END,FAIL -#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com - -# Set up the environment by loading modules -source /tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/env/bin/activate -FP_BIGS=/tmp-network/user/vnikouli/Projects/bigscience/ - -data_sample=100000 -ch=$1 -lng=$2 -dataset=oscar -adapter_config="pfeiffer+inv" -adapter_reduction_factor=48 -model_name="tr5b-1B3-multilingual-alpha-checkpoints/ch${ch}" -tokenizer_dir="${FP_BIGS}/tokenizers/bigscience-1.3B-${lng}-tokenizer" -cache_dir="${FP_BIGS}/data/${dataset}_${lng}" -data_dir="${FP_BIGS}/exp-009/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}" -data_tok_dir="${FP_BIGS}/exp-009/madx-bs1b3-multi-ch${ch}-${lng}-sample${data_sample}/lng_tok" -output_dir="${data_dir}/withlngembft-lmhead-${adapter_config}-${adapter_reduction_factor}" -logging_dir="${FP_BIGS}/logs/exp-009/madx-bs1b3-multi-ch${ch}-${dataset}-${lng}-sample${data_sample}-withlngembft-lmhead-${adapter_config}-${adapter_reduction_factor}" - - -python $FP_BIGS/multilingual-modeling/scripts/madx_exp/madx_lngembft_clm.py \ - --fp16 \ - --model_name_or_path ${FP_BIGS}/multilingual-modeling/scripts/exp-009/$model_name \ - --tokenizer_name ${tokenizer_dir} \ - --dataset_name ${dataset} \ - --cache_dir $cache_dir \ - --dataset_config_name unshuffled_deduplicated_${lng} \ - --logging_dir ${logging_dir} \ - --report_to "tensorboard" \ - --learning_rate 0.001 \ - --do_train \ - --do_eval \ - --output_dir ${output_dir} \ - --preprocessing_num_workers 16 \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 2 \ - --eval_steps 5000 \ - --evaluation_strategy "steps" \ - --max_eval_samples 5000 \ - --train_adapter \ - --adapter_reduction_factor ${adapter_reduction_factor} \ - --language ${lng} \ - --num_train_epochs 6.0 \ - --adapter_config ${adapter_config} \ - --max_train_samples ${data_sample} diff --git a/scripts/xnli/README.md b/scripts/xnli/README.md deleted file mode 100644 index f368439..0000000 --- a/scripts/xnli/README.md +++ /dev/null @@ -1,80 +0,0 @@ -# XNLI Evaluation - -Use `xnli_v2.py` to run the evaluation on XNLI. - -### With Language Adapters -``` -LANG="th" -CACHE_DIR="/users/zyong2/data/zyong2/huggingface/" -lr=5e-5 - -# Original BigScience model and language-specific tokenizer -MODEL_NAME="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-ckpt118500" -TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_24000" - -# saved language adapters -MADX_LANG_ADAPTER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/oscar_th" - -# saved embedding layers -WTE="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/transformer.wte.weight.pt" -WPE="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_adpt_100000samples/transformer.wpe.weight.pt" - -# output directory -OUTPUT_DIR="$FP_BIGS/data/processed/021/xnli_th_adpt_100000samples" - -mkdir -p $OUTPUT_DIR - -# remove --zero_shot for supervised finetuning setting; otherwise, it will be cross-lingual finetuning setting. 
-# use --use_partial_data to test the code - -python xnli_v2.py \ -$OUTPUT_DIR \ ---lang $LANG \ ---cache_dir $CACHE_DIR \ ---num_train_epochs 2 \ ---learning_rate $lr \ ---per_device_train_batch_size 8 \ ---gradient_accumulation_steps 4 \ ---pretrained_model $MODEL_NAME \ ---tokenizer $TOKENIZER_NAME \ ---do_train \ ---do_eval_after_train \ ---madx_lang_adapter $MADX_LANG_ADAPTER_NAME \ ---wte $WTE \ ---wpe $WPE \ ---zero_shot -``` - -### Embedding only approach (No Language Adapters) -``` -LANG="th" -CACHE_DIR="/users/zyong2/data/zyong2/huggingface/" -lr=5e-5 - -# Saved finetuned model and language-specific tokenizer -MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_emb_100000samples" -TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/th_oscar_tokenizer_24000" - -# output directory -OUTPUT_DIR="$FP_BIGS/data/processed/021/xnli_th_adpt_100000samples" - -mkdir -p $OUTPUT_DIR - -# remove --zero_shot for supervised finetuning setting; otherwise, it will be cross-lingual finetuning setting. -# use --use_partial_data to test the code - -python xnli_v2.py \ -$OUTPUT_DIR \ ---lang $LANG \ ---cache_dir $CACHE_DIR \ ---num_train_epochs 2 \ ---learning_rate $lr \ ---per_device_train_batch_size 8 \ ---gradient_accumulation_steps 4 \ ---pretrained_model $MODEL_NAME \ ---tokenizer $TOKENIZER_NAME \ ---do_train \ ---do_eval_after_train \ ---zero_shot \ ---use_partial_data -``` diff --git a/scripts/xnli/archive_xnli.py b/scripts/xnli/archive_xnli.py deleted file mode 100644 index 24aed27..0000000 --- a/scripts/xnli/archive_xnli.py +++ /dev/null @@ -1,222 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import TrainingArguments, Trainer, AdapterTrainer -from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--lang", type=str, default="de") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--original_model") -parser.add_argument("--tokenizer") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") - -parser.add_argument("--madx_lang_adapter") -parser.add_argument("--adapter_lang_name", required=True) - -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -if args.original_model is None: - # here: because the wpe is not saved, pretrained_model is the original bigsciece model - args.original_model = args.pretrained_model - -print("Arguments: ========") -print(args) - - -# load dataset -if args.zero_shot: - print("0️⃣ 
0-Shot") - # 0-shot: use english as train and validation - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - assert args.lang != "en" - - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = xnli_dataset['test'] -else: - print("👀 Supervised Training") - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - - train_dataset = xnli_dataset['train'] - val_dataset = xnli_dataset['validation'] - test_dataset = xnli_dataset['test'] - - -# load tokenizer -tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: - en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer - en_tokenizer.pad_token = en_tokenizer.eos_token - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -def en_tokenize_function(examples): - return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - - -logger.info("Tokenizing the dataset...") -if args.zero_shot: - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) -else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False) - full_val_dataset = val_dataset.map(tokenize_function, batched=False) - -full_test_dataset = test_dataset.map(tokenize_function, batched=False) -small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -from datasets import load_metric -metric = load_metric("xnli") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - eval_steps=500 if not args.use_partial_data else 10, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=500, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -def load_model(args, inference=False): - # for adapters, when we load with GPT2ForSequenceClassification, the embeddings are the original model - if args.zero_shot and not inference: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=en_tokenizer.pad_token_id, - cache_dir=args.cache_dir) - else: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=tokenizer.pad_token_id, - cache_dir=args.cache_dir) - - # this part is to replace the embedding layer - if args.madx_lang_adapter and (not args.zero_shot or (args.zero_shot and inference)): - # if not 
zero shot, that means that we need to replace the embedding layers during training - # we also need to replace embedding layers during inference - causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) - - # change the embedding layer of the original big science model - # by loading the adapters (which has saved lm_head) - causal_lm_model.resize_token_embeddings(len(tokenizer)) - causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") - - model.resize_token_embeddings(len(tokenizer)) - model._modules['transformer']._modules['wte'] = causal_lm_model._modules['transformer']._modules['wte'] - - if not inference: - if not args.zero_shot and args.madx_lang_adapter: - adapter_name = model.load_adapter(args.madx_lang_adapter, - config="pfeiffer+inv", - load_as=args.adapter_lang_name) - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - - print("🔥 ==================== Training: ==================== 🔥") - print(model) - for name, param in model.named_parameters(): - if not param.requires_grad: - print(f"🥶 Frozen layer '{name}'") - else: - print(f"🚀 Trainable layer '{name}'") - else: - print("🔥 ==================== Inference: ==================== 🔥") - if args.madx_lang_adapter: - assert args.pretrained_adapters_dir - adapter_name = model.load_adapter(args.madx_lang_adapter) - model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") - model.set_active_adapters(adapter_name) - else: - # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") - - # for TGT -> TGT supervised finetuning setting, change adapter_name - adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") - model.set_active_adapters(adapter_name) - print(model) - - return model - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = AdapterTrainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - if args.do_eval_after_train: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - if args.madx_lang_adapter: - args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") - - model = load_model(args, inference=True) - training_args.report_to = list() - - trainer = AdapterTrainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, - compute_metrics=compute_metrics - ) - - print("Evaluate on Test:", trainer.evaluate()) \ No newline at end of file diff --git a/scripts/xnli/xnli_v2.py b/scripts/xnli/xnli_v2.py deleted file mode 100644 index 1887e83..0000000 --- a/scripts/xnli/xnli_v2.py +++ /dev/null @@ -1,213 +0,0 @@ -import logging -import argparse -import os - -from datasets import load_dataset -from datasets import load_metric -from collections import namedtuple - -import torch -import numpy as np -from transformers import 
TrainingArguments, Trainer, AdapterTrainer -from transformers import AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification, AutoModelForCausalLM - -# setup logging -import sys -from loguru import logger -logger.remove() -logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") - - -# parser -parser = argparse.ArgumentParser() -parser.add_argument("output_dir") -parser.add_argument("--lang", type=str, default="de") -parser.add_argument("--cache_dir") -parser.add_argument("--num_train_epochs", type=int, default=30) -parser.add_argument("--learning_rate", type=float, default=1e-5) -parser.add_argument("--per_device_train_batch_size", type=int, default=4) -parser.add_argument("--gradient_accumulation_steps", type=int, default=4) -parser.add_argument("--pretrained_model") -parser.add_argument("--original_model") -parser.add_argument("--wte") -parser.add_argument("--wpe") -parser.add_argument("--tokenizer") -parser.add_argument("--madx_lang_adapter") -parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") -parser.add_argument("--do_predict", default=False, action="store_true") -parser.add_argument("--use_partial_data", default=False, action="store_true") -parser.add_argument("--zero_shot", default=False, action="store_true") - -args = parser.parse_args() -if args.do_eval_after_train: - args.do_predict = True - -if args.original_model is None: - # here: because the wpe is not saved, pretrained_model is the original bigsciece model - args.original_model = args.pretrained_model - -print("Arguments: ========") -print(args) - - -# load dataset -if args.zero_shot: - print("0️⃣ 0-Shot") - # 0-shot: use english as train and validation - xnli_en_dataset = load_dataset("xnli", "en", cache_dir=args.cache_dir) - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - assert args.lang != "en" - - train_dataset = xnli_en_dataset['train'] - val_dataset = xnli_en_dataset['validation'] - test_dataset = xnli_dataset['test'] -else: - print("👀 Supervised Training") - xnli_dataset = load_dataset("xnli", args.lang, cache_dir=args.cache_dir) - - train_dataset = xnli_dataset['train'] - val_dataset = xnli_dataset['validation'] - test_dataset = xnli_dataset['test'] - - -# load tokenizer -tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir) -tokenizer.pad_token = tokenizer.eos_token # tokenizer.encode(tokenizer.eos_token) = [0] -if args.zero_shot: - en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir) # has to use AutoTokenizer instead of GPT2Tokenizer - en_tokenizer.pad_token = en_tokenizer.eos_token - -def tokenize_function(examples): - return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - -def en_tokenize_function(examples): - return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) - - -logger.info("Tokenizing the dataset...") -if args.zero_shot: - full_train_dataset = train_dataset.map(en_tokenize_function, batched=False) - full_val_dataset = val_dataset.map(en_tokenize_function, batched=False) -else: - full_train_dataset = train_dataset.map(tokenize_function, batched=False) - full_val_dataset = val_dataset.map(tokenize_function, batched=False) - -full_test_dataset = test_dataset.map(tokenize_function, batched=False) 
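# The small_* splits below are 100-example subsets used only when --use_partial_data is passed
# (see the README above); they let the whole train/eval loop be smoke-tested quickly before
# launching a full XNLI run on the complete splits.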
-small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(100)) -small_val_dataset = full_val_dataset.shuffle(seed=42).select(range(100)) -small_test_dataset = full_test_dataset.shuffle(seed=42).select(range(100)) - -from datasets import load_metric -metric = load_metric("xnli") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -training_args = TrainingArguments( - args.output_dir, - overwrite_output_dir=True, - do_train=True, - do_eval=True, - eval_steps=500 if not args.use_partial_data else 10, - num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.per_device_train_batch_size, - gradient_accumulation_steps=args.gradient_accumulation_steps, - learning_rate=args.learning_rate, - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - logging_steps=500, - report_to="tensorboard", - logging_dir=f"{args.output_dir}/logs", - load_best_model_at_end=True, -) - -def load_model(args, inference=False): - # for adapters, when we load with GPT2ForSequenceClassification, the embeddings are the original model - if args.zero_shot and not inference: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=en_tokenizer.pad_token_id, - cache_dir=args.cache_dir) - else: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_model, - num_labels=3, - pad_token_id=tokenizer.pad_token_id, - cache_dir=args.cache_dir) - - # this part is to replace the embedding layer - if not args.zero_shot or (args.zero_shot and inference): - if args.wpe: - wpe = torch.load(args.wpe) - model._modules['transformer']._modules['wpe'].weight.data = wpe - logger.info(f"Loaded wpe from {args.wpe}") - if args.wte: - wte = torch.load(args.wte) - model._modules['transformer']._modules['wte'].weight.data = wte - logger.info(f"Loaded wte from {args.wte}") - - if not inference: - if not args.zero_shot and args.madx_lang_adapter: - adapter_name = model.load_adapter(args.madx_lang_adapter, - config="pfeiffer+inv") - model.add_adapter("xnli-task-adapter") - model.train_adapter("xnli-task-adapter") - - print("🔥 ==================== Training: ==================== 🔥") - print(model) - for name, param in model.named_parameters(): - if not param.requires_grad: - print(f"🥶 Frozen layer '{name}'") - else: - print(f"🚀 Trainable layer '{name}'") - else: - print("🔥 ==================== Inference: ==================== 🔥") - assert args.pretrained_adapters_dir - if args.madx_lang_adapter: - adapter_name = model.load_adapter(args.madx_lang_adapter) - model.set_active_adapters(adapter_name) - - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/xnli-task-adapter") - model.set_active_adapters(adapter_name) - print(model) - - return model - -if args.do_train: - logger.info("Start Training") - model = load_model(args) - trainer = AdapterTrainer( - model=model, - args=training_args, - train_dataset=small_train_dataset if args.use_partial_data else full_train_dataset, - eval_dataset=small_val_dataset if args.use_partial_data else full_val_dataset, - compute_metrics=compute_metrics - ) - - trainer.train() - -if args.do_predict: - evaluation_dirs = list(sorted([ - checkpoint_dir - for checkpoint_dir in os.listdir(args.output_dir) - if checkpoint_dir.startswith('checkpoint-') - ], key=lambda x: int(x[len('checkpoint-'):]))) - args.pretrained_adapters_dir = 
f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained task adapters from {args.pretrained_adapters_dir}") - - model = load_model(args, inference=True) - training_args.report_to = list() - - trainer = AdapterTrainer( - model=model, - args=training_args, - eval_dataset=small_test_dataset if args.use_partial_data else full_test_dataset, - compute_metrics=compute_metrics - ) - - result = trainer.evaluate() - - print("Evaluate on Test:", result) \ No newline at end of file From 614ca0d74a81ea4bd0229ff4843a607ca2ea9c24 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 2 Jun 2022 08:22:59 -0400 Subject: [PATCH 083/142] update tokenizer for BLOOM --- scripts/lang_adapt/tokenized4clm_sampled.py | 13 ++++--- scripts/lang_adapt/train_tokenizer.sh | 17 --------- scripts/lang_adapt/train_tokenizer_scratch.sh | 37 +++++++++++++++++-- 3 files changed, 41 insertions(+), 26 deletions(-) delete mode 100644 scripts/lang_adapt/train_tokenizer.sh diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index ccafa21..a1c4c87 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -33,6 +33,7 @@ parser.add_argument('--extend_vocab', action='store_true') # parser.add_argument('--replace_with_overlap', action='store_true') parser.add_argument('--sample_size', default=None, type=int) +parser.add_argument("--use_auth_token", default=False, action="store_true") args = parser.parse_args() lang = args.lang @@ -64,16 +65,18 @@ def batch_iterator(): yield sample unique_toks = set() +model_name = pathlib.Path(args.model).parts[-1] if args.extend_vocab: + # FIXME: needs to work on loading the original tokenizer. tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) print("✅ Trained tokenizer with len ", len(new_tokenizer)) added = tokenizer.add_tokens([tok for tok in new_tokenizer.vocab.keys()]) print(f"Overlap with previous vocab: {args.vocab_size - added}") - tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") - print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_extend") + tokenizer.save_pretrained(f"{args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_extend") + print(f"Saved tokenizer to {args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_extend") # elif args.replace_with_overlap: # # @@ -86,10 +89,10 @@ def batch_iterator(): # print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") else: - tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer = AutoTokenizer.from_pretrained(args.model, use_auth_token=args.use_auth_token) assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) print("Unique toks, ", len(unique_toks)) print("✅ Trained tokenizer with len ", len(new_tokenizer)) - new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_replace") - print(f"Saved tokenizer to 
{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_replace") + new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_replace") + print(f"Saved tokenizer to {args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_replace") diff --git a/scripts/lang_adapt/train_tokenizer.sh b/scripts/lang_adapt/train_tokenizer.sh deleted file mode 100644 index 7a95182..0000000 --- a/scripts/lang_adapt/train_tokenizer.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=cpu - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - - - -bs_dir=/tmp-network/user/vnikouli/Projects/bigscience -lng=$1 -sample_size=$2 -vocab_size=$3 -source $bs_dir/multilingual-modeling/scripts/env/bin/activate -python tokenized4clm_sampled.py --lang $lng --tokenizer_dir $bs_dir/tokenizers --hf_cache_dir $bs_dir/data --vocab_size $vocab_size --sample_size $sample_size - diff --git a/scripts/lang_adapt/train_tokenizer_scratch.sh b/scripts/lang_adapt/train_tokenizer_scratch.sh index 354efbb..8187c2c 100644 --- a/scripts/lang_adapt/train_tokenizer_scratch.sh +++ b/scripts/lang_adapt/train_tokenizer_scratch.sh @@ -1,17 +1,46 @@ #!/bin/bash +# Request half an hour of runtime: +#SBATCH --time=1:59:00 + # Ask for the GPU partition and 1 GPU -#SBATCH --partition=cpu +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=4 # Use more memory (10GB) (CPU RAM): #SBATCH --mem=50g +# Specify a job name: +#SBATCH -J exp-020-tokenized4clm_sampled + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-020/tokenized4clm_sampled_scratch.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-020/tokenized4clm_sampled_scratch.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +module load gitlfs/2.7.1 +source $FP_BIGS/env_try_lang_adapter/bin/activate -bs_dir=/tmp-network/user/vnikouli/Projects/bigscience +# call by `sbatch train_tokenizer_scratch.sh my 1000 5000` +cache_dir="/users/zyong2/data/zyong2/huggingface/" lng=$1 sample_size=$2 vocab_size=$3 -source $bs_dir/multilingual-modeling/scripts/env/bin/activate -python tokenized4clm_sampled.py --lang $lng --tokenizer_dir $bs_dir/tokenizers --hf_cache_dir $bs_dir/data --vocab_size $vocab_size --sample_size $sample_size +MODEL="bigscience/bloom-1b3" +python /users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/scripts/lang_adapt/tokenized4clm_sampled.py \ +--lang $lng \ +--model $MODEL \ +--tokenizer_dir /users/zyong2/data/zyong2/bigscience/data/processed/020 \ +--hf_cache_dir $cache_dir \ +--vocab_size $vocab_size \ +--sample_size $sample_size \ +--use_auth_token From 1fdd5357246b6da19be6e9c60a6a750d3de62f13 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 2 Jun 2022 09:09:10 -0400 Subject: [PATCH 084/142] language adaptation BLOOM --- scripts/lang_adapt/madx_run_clm.py | 40 ++++++++++++++++++++---------- scripts/lang_adapt/run_clm_emb.sh | 14 +++++------ 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index f6547ce..39bfa70 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -312,14 +312,19 @@ def load_model(model_args, tokenizer): def preprocess_data(training_args, 
data_args, model_args, tokenizer): with training_args.main_process_first(desc="dataset map tokenization"): - saved_tokenized_datasets_fp = pathlib.Path(f"{training_args.data_dir}/tokenized_data.pt") + # cache tokenized data + base_cache_dir = f"{model_args.cache_dir}/{data_args.dataset_name}/{data_args.dataset_config_name}" + saved_tokenized_datasets_fp = pathlib.Path(f"{base_cache_dir}/tokenized_data_{data_args.max_train_samples}train_{data_args.max_eval_samples}eval.pt") + if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) - logger.info(f"✅ loaded tokenized_data") + logger.info(f"✅ loaded tokenized_data from {saved_tokenized_datasets_fp}") else: raw_datasets = load_data(data_args, model_args) assert len(raw_datasets['train']) == data_args.max_train_samples - print(f"✅ Sanity check: loaded raw datasets have {data_args.max_train_samples} samples") + assert len(raw_datasets['validation']) == data_args.max_eval_samples + assert len(raw_datasets['test']) == data_args.max_eval_samples + print(f"✅ Sanity check: loaded raw datasets have {data_args.max_train_samples} training samples and {data_args.max_eval_samples} eval samples") # First we tokenize all the texts. if training_args.do_train: @@ -340,7 +345,7 @@ def tokenize_function(examples): "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." ) return output - + tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, @@ -351,12 +356,13 @@ def tokenize_function(examples): ) torch.save(tokenized_datasets, saved_tokenized_datasets_fp) - logger.info(f"✅ saved tokenized_data") + logger.info(f"✅ saved tokenized_data to {saved_tokenized_datasets_fp}") if "train" not in tokenized_datasets and training_args.do_train: raise ValueError("--do_train requires a train dataset") if "validation" not in tokenized_datasets and training_args.do_eval: raise ValueError("--do_eval requires a validation dataset") + return tokenized_datasets @@ -401,10 +407,12 @@ def group_texts(examples): # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map with training_args.main_process_first(desc="grouping texts together"): - saved_lm_datasets_fp = pathlib.Path(f"{training_args.data_dir}/lm_data.pt") + base_cache_dir = f"{model_args.cache_dir}/{data_args.dataset_name}/{data_args.dataset_config_name}" + saved_lm_datasets_fp = pathlib.Path(f"{base_cache_dir}/lm_data_{data_args.max_train_samples}train_{data_args.max_eval_samples}eval.pt") + if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): lm_datasets = torch.load(str(saved_lm_datasets_fp)) - logger.info("✅ loaded lm_data") + logger.info(f"✅ loaded lm_data from {saved_lm_datasets_fp}") else: tokenized_datasets = preprocess_data(training_args, data_args, model_args, tokenizer) lm_datasets = tokenized_datasets.map( @@ -415,7 +423,7 @@ def group_texts(examples): desc=f"Grouping texts in chunks of {block_size}", ) torch.save(lm_datasets, saved_lm_datasets_fp) - logger.info("✅ saved lm_data") + logger.info(f"✅ saved lm_data to {saved_lm_datasets_fp}") return lm_datasets def modify_model(adapter_args, data_args, model_args, tokenizer, model): @@ -493,7 +501,7 @@ def modify_model(adapter_args, data_args, model_args, tokenizer, model): frozen_params = 0 emb_params = 0 for name, param in model.named_parameters(): - if "wte" in name or "wpe" in name: + if "word_embeddings" in name: 
param.requires_grad = True emb_params += param.numel() elif model_args.lang_adapt_strategies == "emb": @@ -616,10 +624,16 @@ def main(): trainer.save_model() # Saves the tokenizer too for easy upload # normally this part only saves the adapters? (TODO: check) # save embedding and positional embedding (which is not saved by trainer) - embedding_name = "lng_emb" if model_args.embedding_strategies == "overlap-replace" else "default" - trainer.model.save_embeddings(trainer.args.output_dir, embedding_name) - torch.save(trainer.model.transformer.wte, f'{trainer.args.output_dir}/embedding_wte.pt') # for sanity check - torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/embedding_wpe.pt') + + # FIXME: need to integrate adapterhub's save_embeddings + # embedding_name = "lng_emb" if model_args.embedding_strategies == "overlap-replace" else "default" + # trainer.model.save_embeddings(trainer.args.output_dir, embedding_name) + # torch.save(trainer.model.transformer.wte, f'{trainer.args.output_dir}/embedding_wte.pt') # for sanity check + # torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/embedding_wpe.pt') + + torch.save(trainer.model.transformer.word_embeddings, f'{trainer.args.output_dir}/word_embeddings.pt') + torch.save(trainer.model.transformer.word_embeddings_layernorm, f'{trainer.args.output_dir}/word_embeddings_layernorm.pt') + metrics = train_result.metrics max_train_samples = ( diff --git a/scripts/lang_adapt/run_clm_emb.sh b/scripts/lang_adapt/run_clm_emb.sh index ce2f629..6936acc 100644 --- a/scripts/lang_adapt/run_clm_emb.sh +++ b/scripts/lang_adapt/run_clm_emb.sh @@ -27,23 +27,22 @@ set +a module load python/3.7.4 module load gitlfs/2.7.1 -source $FP_BIGS/env_lang_adapter/bin/activate +source $FP_BIGS/env_try_lang_adapter/bin/activate # axis LANG="my" DATA_SAMPLES=$(($SLURM_ARRAY_TASK_ID * 1000)) -DATA_SAMPLES=100 VOCAB_SIZE=5000 CH=118500 -BIGS_MODEL="/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-ckpt${CH}" +BIGS_MODEL="bigscience/bloom-1b3" ADPT_STRATEGY="emb" EMBD_SRATEGY="replace" -tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/${LANG}_oscar_100000_tokenizer_${VOCAB_SIZE}_${EMBD_SRATEGY}" +tokenizer_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/tok_${BIGS_MODEL##*/}_${LANG}_oscar_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY}" cache_dir="/users/zyong2/data/zyong2/huggingface/" -output_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/${LANG}_emb_${DATA_SAMPLES}samples" -logging_dir="/users/zyong2/data/zyong2/bigscience/reports/020/${LANG}_emb_${DATA_SAMPLES}samples" +output_dir="/users/zyong2/data/zyong2/bigscience/data/processed/020/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${DATA_SAMPLES}samples" +logging_dir="/users/zyong2/data/zyong2/bigscience/reports/020/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${DATA_SAMPLES}samples" mkdir -p $output_dir mkdir -p $logging_dir @@ -77,4 +76,5 @@ python /users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/scripts/lan --max_steps 50000 \ --lang_adapt_strategies $ADPT_STRATEGY \ --embedding_strategies $EMBD_SRATEGY \ - --load_best_model_at_end \ No newline at end of file + --load_best_model_at_end \ + --use_auth_token \ No newline at end of file From ff9c0a2b4d9936240570d10d0c782f8f7369f75b Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 2 Jun 2022 09:30:17 -0400 Subject: [PATCH 085/142] update --- scripts/requirements.txt | 137 ++------------------------------------- 1 file changed, 4 insertions(+), 133 deletions(-) diff 
--git a/scripts/requirements.txt b/scripts/requirements.txt index a4e486a..fd49f62 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,133 +1,4 @@ -absl-py==0.14.0 -anyio==3.3.1 -argcomplete==1.12.3 -argon2-cffi==21.1.0 -attrs==21.2.0 -Babel==2.9.1 -backcall==0.2.0 -bleach==4.1.0 -cachetools==4.2.2 -certifi==2021.5.30 -cffi==1.14.6 -charset-normalizer==2.0.4 -click==8.0.1 -configparser==5.0.2 -datasets==1.11.0 -debugpy==1.4.3 -decorator==5.0.9 -defusedxml==0.7.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.3 -filelock==3.0.12 -fsspec==2021.8.1 -gitdb==4.0.7 -GitPython==3.1.24 -google-auth==1.35.0 -google-auth-oauthlib==0.4.6 -grpcio==1.41.0 -huggingface-hub==0.0.16 -idna==3.2 -importlib-metadata==4.8.1 -ipykernel==6.4.1 -ipython==7.27.0 -ipython-genutils==0.2.0 -ipywidgets==7.6.4 -jedi==0.18.0 -Jinja2==3.0.1 -joblib==1.0.1 -json5==0.9.6 -jsonschema==3.2.0 -jupyter==1.0.0 -jupyter-client==7.0.2 -jupyter-console==6.4.0 -jupyter-core==4.7.1 -jupyter-server==1.11.0 -jupyterlab==3.1.11 -jupyterlab-pygments==0.1.2 -jupyterlab-server==2.8.1 -jupyterlab-widgets==1.0.1 -lxml==4.6.3 -Markdown==3.3.4 -MarkupSafe==2.0.1 -matplotlib-inline==0.1.3 -mistune==0.8.4 -multiprocess==0.70.12.2 -nbclassic==0.3.1 -nbclient==0.5.4 -nbconvert==6.1.0 -nbformat==5.1.3 -nest-asyncio==1.5.1 -notebook==6.4.3 -numpy==1.21.2 -oauthlib==3.1.1 -packaging==21.0 -pandas==1.3.2 -pandocfilters==1.4.3 -parso==0.8.2 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -Pillow==8.3.2 -prometheus-client==0.11.0 -promise==2.3 -prompt-toolkit==3.0.20 -protobuf==3.18.0 -psutil==5.8.0 -ptyprocess==0.7.0 -pyarrow==5.0.0 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pycparser==2.20 -Pygments==2.10.0 -pyparsing==2.4.7 -pyrsistent==0.18.0 -python-dateutil==2.8.2 -python-dotenv==0.19.0 -pytz==2021.1 -PyYAML==5.4.1 -pyzmq==22.2.1 -qtconsole==5.1.1 -QtPy==1.11.0 -regex==2021.8.28 -requests==2.26.0 -requests-oauthlib==1.3.0 -requests-unixsocket==0.2.0 -rsa==4.7.2 -sacremoses==0.0.45 -scikit-learn==0.24.2 -scipy==1.7.1 -Send2Trash==1.8.0 -sentry-sdk==1.4.2 -shortuuid==1.0.1 -six==1.16.0 -sklearn==0.0 -smmap==4.0.0 -sniffio==1.2.0 -subprocess32==3.5.4 -tensorboard==2.6.0 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.0 -termcolor==1.1.0 -terminado==0.12.1 -testpath==0.5.0 -threadpoolctl==2.2.0 -tokenizers==0.10.3 -torch==1.9.0+cu111 -torchaudio==0.9.0 -torchvision==0.10.0+cu111 -tornado==6.1 -tqdm==4.62.2 -traitlets==5.1.0 -transformers @ git+https://github.com/huggingface/transformers@010965dcde8ce9526f6a7e6e2c3f36276c153708 -typing-extensions==3.10.0.2 -urllib3==1.26.6 -wandb==0.12.2 -wcwidth==0.2.5 -webencodings==0.5.1 -websocket-client==1.2.1 -Werkzeug==2.0.1 -widgetsnbextension==3.5.1 -xxhash==2.0.2 -yaspin==2.1.0 -zipp==3.5.0 +git+https://github.com/yongzx/adapter-transformers.git@bloom +datasets +torch --extra-index-url https://download.pytorch.org/whl/cu113 +tensorboardX \ No newline at end of file From 4c55c95b913586808cc3f473c75e5b08e3ded7cc Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Thu, 9 Jun 2022 11:13:24 +0200 Subject: [PATCH 086/142] updated scripts for sentence retrieval eval --- .../compute_retrieval_acc.sh | 16 ++ .../eval_sentence_retrieval.py | 230 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh create mode 100644 scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py diff --git a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh 
b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh new file mode 100644 index 0000000..084e19d --- /dev/null +++ b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH -p gpu +#SBATCH --gres="gpu:1" +#SBATCH --mem=200g +#SBATCH --constraint="gpu_v100&gpu_32g" +# Specify an output file +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=vassilina.nikoulina@naverlabs.com + + +source /tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/env_bloom/bin/activate +model=$1 +dataset=$2 +outdir=$model/retrieval_acc-${dataset} +mkdir -p $outdir +python eval_sentence_retrieval.py $outdir --pretrained_model $model --tokenizer $model --dataset $dataset --pooling "max_min" diff --git a/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py b/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py new file mode 100644 index 0000000..fbe33d3 --- /dev/null +++ b/scripts/exp_sentence_retrievale_eval/eval_sentence_retrieval.py @@ -0,0 +1,230 @@ +import logging +import argparse +import os +from datasets import load_dataset +from collections import namedtuple +import torch +import numpy as np +from transformers import BertTokenizer, BertModel +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM +import matplotlib +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import os.path +import sys +from loguru import logger +import random +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--pretrained_model", default="bert-base-multilingual-cased") +parser.add_argument("--tokenizer", default="bert-base-multilingual-cased") +parser.add_argument("--dataset", default="ted_multi") +parser.add_argument("--device", default="cuda") +parser.add_argument("--pooling", default="mean") +args = parser.parse_args() + +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) +ted_lngs = ['am', 'ar', 'bn', 'ca', 'en', 'es', 'fr', 'hi', 'id', 'ja', 'pt', 'zh-cn', 'zh-tw', 'pt-br'] +flores_lng = ["amh", "bos", "cat", "eng", "spa", "fra", "hin", "ind", "jpn", "por", "swh", "vie", "urd"] +bs_languages = ["id", "eu", "vi", "zh", "ur", "es", "ca", "pt", "fr", "en", "hi", "ar", "bn"] +lngcode_map = {"am":"amh", "bn":"bos", "ca":"cat", "en":"eng", "es":"spa", "fr": "fra", "hi": "hin", "id": "ind", "ja": "jpn", "pt": "por", "ur":"urd", "vi":"vie" } + + +print("Arguments: ========") +print(args) + + +def load_dataset_(args): + if args.dataset == "ted_multi": + return load_dataset_ted(args) + if args.dataset == "flores": + return load_dataset_flores(args) + + +def load_dataset_flores_for_lng(args, lng): + dataset = load_dataset("gsarti/flores_101", lngcode_map[lng])['dev'] + return dataset + +def load_dataset_flores(args): + dataset = {} + for lng in bs_languages: + if lng in lngcode_map: + load_dataset_flores_for_lng(args, lng) + return dataset + +def load_dataset_ted(args): + dataset = load_dataset("ted_multi")['validation'] + return dataset + +def get_talks(dataset, nb_talks): + talk_names = [] + for t in dataset['talk_name']: + if len(talk_names) < nb_talks and not t in talk_names: + talk_names.append(t) + + + print([(t1, len([t for t in dataset['talk_name'] if t == t1])) for t1 in talk_names]) + return talk_names + +def load_model(args): + if "xlm" in args.pretrained_model or "bert" in args.pretrained_model: + model = 
AutoModelForMaskedLM.from_pretrained(args.pretrained_model) + else: + model = AutoModelForCausalLM.from_pretrained(args.pretrained_model) + model.config.output_hidden_states=True + return model.to(args.device) + +Sample = namedtuple( + "Sample", + ("id", "hidden_state") +) + +def load_from_file(fname): + return torch.load(fname) + + +def get_hidden_states(args, model): + if args.dataset == "ted_multi": + dataset = load_dataset_(args) + nb_talks = 2 + talks = get_talks(dataset, nb_talks) + + emb = get_hidden_states_for_talks(dataset, model, talks, args.pretrained_model, pooling=args.pooling) + + outname = f"{args.output_dir}/{args.pretrained_model.replace('/','-')}-talks-valid-{len(talks)}-{args.pooling}" + + elif args.dataset == "flores": + nb_samples = 200 + emb = get_hidden_states_for_flores(args, model, args.pretrained_model, nb_samples = nb_samples, pooling=args.pooling) + outname = f"{args.output_dir}/{args.pretrained_model.replace('/','-')}-flores-{nb_samples}-{args.pooling}" + + retrieval_acc = {} + nb_states = model.config.num_hidden_layers + fig, ax = plt.subplots(1, int(nb_states/step), figsize=(12*int(nb_states/step), 10)) + + + with open(f"{outname}.log", 'w') as fout: + for state in range(0, nb_states, step): + plot_retrieval_acc(state, emb, ax[int(state/step)], fout) + + fig.tight_layout() + plt.savefig(f'{outname}-heatmap.png') + + +def get_hidden_states_for_flores(args, model, mname, nb_samples=50, pooling=""): + emb = {} + hidden_state_size = model.config.num_hidden_layers + for lng in bs_languages: + if lng in lngcode_map: + fname = f"{args.output_dir}/flores-{lng}-{nb_samples}-{mname.replace('/','-')}-{pooling}.pt" + if os.path.isfile(fname): + emb[lng] = load_from_file(fname) + else: + dataset = load_dataset_flores_for_lng(args, lng) + emb[lng] = {} + for state in range(hidden_state_size): + emb[lng][state] = [] + for i, sid in enumerate(dataset['id'][:nb_samples]): + t = dataset['sentence'][i] + x = tokenizer(t, return_tensors="pt").input_ids.to(model.device) + out = model(x) + for state in range(hidden_state_size): + if "max_min" in fname: + hs = torch.cat([torch.max(out.hidden_states[state][0][1:-1], dim=0).values, torch.min(out.hidden_states[state][0][1:-1], dim=0).values]).detach() + else: + hs = torch.mean(out.hidden_states[state][0][1:-1], dim=0).detach() + emb[lng][state].append(Sample(sid, hs)) + torch.save(emb[lng], fname) + return emb + + +def get_hidden_states_for_talks(dataset, model, talks, mname, pooling=""): + emb = {} + hidden_state_size = model.config.num_hidden_layers + + fname = f"{args.output_dir}/ted_multi-{mname.replace('/','-')}-ted_multi-{len(talks)}-{pooling}.pt" + if os.path.isfile(fname): + emb = load_from_file(fname) + return emb + for sid, sample in enumerate(dataset): + if sample['talk_name'] in talks: + tsample = sample['translations'] + for i, lng in enumerate(tsample['language']): + if lng in bs_languages: + t = tsample['translation'][i] + x = tokenizer(t, return_tensors="pt").input_ids.to(model.device) + if not lng in emb: + emb[lng] = {} + for state in range(hidden_state_size): + emb[lng][state] = [] + out = model(x) + for state in range(hidden_state_size): + if "max_min" in fname: + hs = torch.cat([torch.max(out.hidden_states[state][0], dim=0).values, torch.min(out.hidden_states[state][0], dim=0).values]).detach() + else: + hs = torch.mean(out.hidden_states[state][0], dim=0).detach() + emb[lng][state].append(Sample(sid, hs)) + torch.save(emb, fname) + return emb + + +def compute_sent_retrieval_acc(lng1, lng2, emb, state, out): + cos 
= torch.nn.CosineSimilarity(dim=0, eps=1e-6) + E1 = torch.stack([s[1] for s in emb[lng1][state]]) + E2 = torch.stack([s[1] for s in emb[lng2][state]]) + #cos_matrix = [[cos(E2[i],E1[j]) for i in range(E2.shape[0]) ] for j in range(E1.shape[0])] + match = 0 + intersection_ids = set([emb[lng1][state][i][0] for i in range(E1.shape[0])]).intersection( + set([emb[lng2][state][i][0] for i in range(E2.shape[0])]) + ) + if len(intersection_ids)>0: + random_acc = 1/len(intersection_ids) + for i in range(E1.shape[0]): + if emb[lng1][state][i][0] in intersection_ids: + cos_sim = [cos(E2[j], E1[i]) for j in range(E2.shape[0])] + best_match = torch.argmax(torch.stack(cos_sim)) + if emb[lng2][state][best_match][0] == emb[lng1][state][i][0]: + match +=1 + acc = match/len(intersection_ids) + out.write(f"{lng1}-{lng2} = {acc} (random {random_acc} )\n") + return acc, len(intersection_ids) + else: + return 0, 0 + +def plot_retrieval_acc(state, emb, ax, out): + cmap="RdYlBu" + mean_per_state = 0 + for lng1 in emb: + if not lng1 in retrieval_acc: + retrieval_acc[lng1] = {} + for lng2 in emb: + lng2_chance = 1.0/len(emb[lng2][0]) + #if not lng1 == lng2: + acc, random_acc = compute_sent_retrieval_acc(lng1, lng2, emb, state, out) + retrieval_acc[lng1][lng2] = acc + #retrieval_acc[lng1]["random"] = lng2_chance + mean_acc = np.mean([v for v in retrieval_acc[lng1].values()]) + out.write(f"ACC per {lng1}, layer {state} = {mean_acc} \n" ) + mean_per_state +=mean_acc + mean_per_state = mean_per_state/len(emb.keys()) + out.write(f"ACC overall, layer {state} = {mean_per_state}\n" ) + m_res = pd.DataFrame(retrieval_acc) + m_res.columns=emb.keys() + m_res.index=emb.keys()#[e for e in emb.keys()]+["random"] + ax.set_title(f"state {state}") + sns.heatmap(m_res, ax=ax, annot=False, vmin=0, vmax=1.0, center=0, cmap=cmap) + + + +lngs2consider = ['am', 'ar', 'bn', 'ca', 'en', 'es', 'fr', 'hi', 'id', 'ja', 'pt', 'zh-cn', 'zh-tw', 'pt-br'] +samples = 10 +model = load_model(args) +retrieval_acc = {} +step=1 +get_hidden_states(args, model) From ef0b4acac9e7ab7b24865a0aaa00282581b39fa8 Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Mon, 13 Jun 2022 15:39:20 +0200 Subject: [PATCH 087/142] added prefix tuning option --- scripts/lang_adapt/madx_run_clm.py | 57 +++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 39bfa70..1fcef86 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -34,6 +34,8 @@ set_seed, ) from transformers.adapters.configuration import AdapterConfig +from transformers.adapters import PrefixTuningConfig + from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version @@ -109,7 +111,11 @@ class ModelArguments: default="", metadata={"help": "choose one of the two strategies - 'replace', 'extend', 'overlap-replace'"}, ) - + adapter_placement: str = field( + default="all", + metadata={"help": "list of layers where to place the adapters: all: use all layers, '17,24': list layers id separated by ','"}, + ) + def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): raise ValueError( @@ -433,19 +439,42 @@ def modify_model(adapter_args, data_args, model_args, tokenizer, model): # if "wte" not in name and "wpe" not in name and "lm_head" not in name: # param.requires_grad = False + def 
get_adapter_config(adapter_args, model_args): + if adapter_args.adapter_config == "prefix_tuning": + if model_args.adapter_placement == "all": + adapter_config = PrefixTuningConfig(bottleneck_size = 800) + else: + adapters2use = set([int(i) for i in model_args.adapter_placement.split(",")]) + adapter_config = PrefixTuningConfig(bottleneck_size = 800, + leave_out = [i for i in range(0,24) if not i in adapters2use] + ) + + + else: + + if model_args.adapter_placement == "all": + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor + ) + else: + adapters2use = set([int(i) for i in model_args.adapter_placement.split(",")]) + adapter_config = AdapterConfig.load( + adapter_args.adapter_config, + non_linearity=adapter_args.adapter_non_linearity, + reduction_factor=adapter_args.adapter_reduction_factor, + leave_out = [i for i in range(0,24) if not i in adapters2use] + ) + return adapter_config # Setup adapters if adapter_args.train_adapter: task_name = data_args.dataset_name or "clm" - task_name += f"_{adapter_args.language}" + task_name += f"_{adapter_args.adapter_config}_{adapter_args.language}" # check if adapter already exists, otherwise add it if task_name not in model.config.adapters: - # resolve the adapter config - adapter_config = AdapterConfig.load( - adapter_args.adapter_config, - non_linearity=adapter_args.adapter_non_linearity, - reduction_factor=adapter_args.adapter_reduction_factor, - ) + adapter_config = get_adapter_config(adapter_args, model_args) # load a pre-trained from Hub if specified if adapter_args.load_adapter: model.load_adapter( @@ -628,11 +657,13 @@ def main(): # FIXME: need to integrate adapterhub's save_embeddings # embedding_name = "lng_emb" if model_args.embedding_strategies == "overlap-replace" else "default" # trainer.model.save_embeddings(trainer.args.output_dir, embedding_name) - # torch.save(trainer.model.transformer.wte, f'{trainer.args.output_dir}/embedding_wte.pt') # for sanity check - # torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/embedding_wpe.pt') - - torch.save(trainer.model.transformer.word_embeddings, f'{trainer.args.output_dir}/word_embeddings.pt') - torch.save(trainer.model.transformer.word_embeddings_layernorm, f'{trainer.args.output_dir}/word_embeddings_layernorm.pt') + + torch.save(trainer.model.transformer.wte, f'{trainer.args.output_dir}/embedding_wte.pt') # for sanity check + torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/embedding_wpe.pt') + + # I assume + #torch.save(trainer.model.transformer.word_embeddings, f'{trainer.args.output_dir}/word_embeddings.pt') + #torch.save(trainer.model.transformer.word_embeddings_layernorm, f'{trainer.args.output_dir}/word_embeddings_layernorm.pt') metrics = train_result.metrics From dbea74899a72970561d63cea9a9be79047e64d9c Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Mon, 13 Jun 2022 15:43:13 +0200 Subject: [PATCH 088/142] backcompatibility with original BS model --- scripts/lang_adapt/madx_run_clm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 1fcef86..4acbc2e 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -653,17 +653,17 @@ def main(): trainer.save_model() # Saves the tokenizer too for easy upload # normally this part only saves the adapters? 
(TODO: check) # save embedding and positional embedding (which is not saved by trainer) - - # FIXME: need to integrate adapterhub's save_embeddings - # embedding_name = "lng_emb" if model_args.embedding_strategies == "overlap-replace" else "default" - # trainer.model.save_embeddings(trainer.args.output_dir, embedding_name) - torch.save(trainer.model.transformer.wte, f'{trainer.args.output_dir}/embedding_wte.pt') # for sanity check - torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/embedding_wpe.pt') + # This part is used if we use initial BS 1b3 model (the one used for experiments reported in the paper) + if hasattr(trainer.model.transformer, "wte"): + torch.save(trainer.model.transformer.wte, f'{trainer.args.output_dir}/embedding_wte.pt') # for sanity check + if hasattr(trainer.model.transformer, "wpe"): + torch.save(trainer.model.transformer.wpe, f'{trainer.args.output_dir}/embedding_wpe.pt') - # I assume - #torch.save(trainer.model.transformer.word_embeddings, f'{trainer.args.output_dir}/word_embeddings.pt') - #torch.save(trainer.model.transformer.word_embeddings_layernorm, f'{trainer.args.output_dir}/word_embeddings_layernorm.pt') + # this part is used for BLOOM models + if hasattr(trainer.model.transformer, "word_embeddings"): + torch.save(trainer.model.transformer.word_embeddings, f'{trainer.args.output_dir}/word_embeddings.pt') + torch.save(trainer.model.transformer.word_embeddings_layernorm, f'{trainer.args.output_dir}/word_embeddings_layernorm.pt') metrics = train_result.metrics From 8f4815a9d0bb48abe4a5a69a1a61fc27874acece Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 15 Jun 2022 14:01:04 -0400 Subject: [PATCH 089/142] Debugging XNLI --- scripts/eval/eval.py | 474 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 474 insertions(+) create mode 100644 scripts/eval/eval.py diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py new file mode 100644 index 0000000..09cfc64 --- /dev/null +++ b/scripts/eval/eval.py @@ -0,0 +1,474 @@ +import logging +import argparse +import os + +from datasets import load_dataset +from datasets import load_metric +from collections import namedtuple + +import nltk +import torch +import numpy as np +from transformers import TrainingArguments, Trainer, Seq2SeqTrainer, AdapterTrainer, Seq2SeqAdapterTrainer, Seq2SeqTrainingArguments +from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForCausalLM +from transformers import DataCollatorForSeq2Seq +from transformers import ( + get_linear_schedule_with_warmup, + LogitsProcessorList, + BeamSearchScorer, + ForcedEOSTokenLogitsProcessor +) + +# setup logging +import sys +from loguru import logger +logger.remove() +logger.add(sys.stderr, format="{level} {level.icon} | [{time}] - {message}") + + +# AVAILABLE TASKS +XNLI = "xnli" +XLSUM = "csebuetnlp/xlsum" + +# parser +parser = argparse.ArgumentParser() +parser.add_argument("output_dir") +parser.add_argument("--lang", type=str, default="german") #xlsum requires a language name, not language code + +tasks = [XNLI, XLSUM] +parser.add_argument("--dataset", choices=tasks, required=True) + +parser.add_argument("--cache_dir") +parser.add_argument("--num_train_epochs", type=int, default=30) +parser.add_argument("--max_steps", type=int, default=-1) +parser.add_argument("--seed", type=int, default=42) +parser.add_argument("--learning_rate", type=float, default=1e-5) +parser.add_argument("--per_device_train_batch_size", type=int, default=4) 
+parser.add_argument("--gradient_accumulation_steps", type=int, default=4) +parser.add_argument("--per_device_eval_batch_size", type=int, default=1) +parser.add_argument("--adapted_model") +parser.add_argument("--original_model") +parser.add_argument("--tokenizer") +parser.add_argument("--do_train", default=False, action="store_true") +parser.add_argument("--do_eval_after_train", default=False, action="store_true") +parser.add_argument("--do_predict", default=False, action="store_true") +parser.add_argument("--use_partial_data", default=False, action="store_true") +parser.add_argument("--use_partial_train_data", type=int, default=100) +parser.add_argument("--use_partial_val_data", type=int, default=-1) +parser.add_argument("--use_partial_test_data", type=int, default=-1) +parser.add_argument("--cross_lingual", default=False, action="store_true") +parser.add_argument("--revision", type=str, default="main") +parser.add_argument("--local_rank", type=int) + +parser.add_argument("--madx_lang_adapter", default=None) +parser.add_argument("--no_task_adapter", default=False, action="store_true") +# parser.add_argument("--adapter_lang_name", required=True) +parser.add_argument("--deepspeed", required=False) + +# mapping of tasks to model/trainer classes +model_class_mapping = {XNLI: AutoModelForSequenceClassification, XLSUM: AutoModelWithLMHead} +trainer_no_task_adpt_class_mapping = {XNLI: Trainer, XLSUM: Seq2SeqTrainer} +trainer_class_mapping = {XNLI: AdapterTrainer, XLSUM: Seq2SeqAdapterTrainer} +trainer_args_mapping = {XNLI: TrainingArguments, XLSUM: Seq2SeqTrainingArguments} + + +args = parser.parse_args() + +#### Process args +if args.do_eval_after_train: + args.do_predict = True + +if not args.madx_lang_adapter and args.no_task_adapter: + logger.warning("❗️ No 'madx_lang_adapter' loaded. This should be the baseline performance.") + +# additional args to pass to the model init. 
task-dependent +optional_model_kwargs = {} +optional_trainer_args = {} +optional_eval_args = {} +if args.dataset == XNLI: + optional_model_kwargs = {"num_labels": 3} +elif args.dataset == XLSUM: + optional_trainer_args = {"generation_max_length": 512 + 64, + "predict_with_generate":True, + "optim": "adafactor", + "lr_scheduler_type": "linear", + "warmup_steps": 0} + + +if args.local_rank: + torch.cuda.set_device(args.local_rank) + +if args.original_model is None: + # here: because the wpe is not saved, adapted_model is the original bigsciece model + args.original_model = args.adapted_model + +print("Arguments: ========") +print(args) + + +# load appropriate dataset +logger.info("Loading dataset...") + +# will need to rename splits if the dataset has different name for validation set +if args.cross_lingual: + logger.info("0️⃣ Cross Lingual setting") + # cross lingual: use english as train and validation set + en_dataset = load_dataset(args.dataset, "english" if args.dataset == XLSUM else "en", cache_dir=args.cache_dir) + dataset = load_dataset(args.dataset, args.lang, cache_dir=args.cache_dir) + + train_dataset = en_dataset["train"] + val_dataset = en_dataset["validation"] + test_dataset = dataset["test"] +else: + logger.info("👀 Supervised training setting") + dataset = load_dataset(args.dataset, args.lang, cache_dir=args.cache_dir) + + train_dataset = dataset["train"] + val_dataset = dataset["validation"] + test_dataset = dataset["test"] + +if args.use_partial_data: + logger.warning("🚨 Loading partial data!") + train_dataset = train_dataset.shuffle(seed=args.seed).select(range(args.use_partial_train_data)) + if args.use_partial_val_data != -1: + val_dataset = val_dataset.shuffle(seed=args.seed).select(range(args.use_partial_val_data)) + if args.use_partial_test_data != -1: + test_dataset = val_dataset.shuffle(seed=args.seed).select(range(args.use_partial_test_data)) + + +# load tokenizer +logger.info("Loading tokenizer...") +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision) +tokenizer.pad_token = tokenizer.eos_token + +# TODO: we probably need better code for this than multiple if-else statements +en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision) +en_tokenizer.pad_token = en_tokenizer.eos_token + +if args.dataset == XNLI: + def tokenize_function(examples): + return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + + def en_tokenize_function(examples): + return en_tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) + +elif args.dataset == XLSUM: + # for decoder only structure, input and target needs to have the same length + # also, unlike enc-dec model, we cannot feed the model some text and expect the model to generate only summary + # we need to have input = [text] + [padding] and the output be [text] + [summary]. 
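# A minimal sketch of the sequence layout built by tokenize_function below,
# using placeholder token ids; the lengths (at most 511 source tokens plus a
# bos separator, at most 64 summary tokens, 576 positions total) follow the
# code, everything else here is illustrative.
BOS, PAD = 1, 0

text_ids = [2] * 300                  # truncated article, at most 511 tokens
summary_ids = [3] * 40                # reference summary, at most 64 tokens

input_ids = text_ids + [BOS] + summary_ids
input_ids += [PAD] * (512 + 64 - len(input_ids))
labels = input_ids[:]                 # causal LM target: the same 576 tokens

# At evaluation time (see compute_beam_search_metrics further down) the model
# is conditioned on the first 512 positions and asked to generate the rest.
assert len(input_ids) == len(labels) == 576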
+ def tokenize_function(example): + text = tokenizer(f'{example["text"]}', max_length=511, truncation=True) + # TODO: sep_token instead of bos_token + input_text = tokenizer.decode(text['input_ids'], skip_special_tokens=True) + tokenizer.bos_token + + with tokenizer.as_target_tokenizer(): + summaries = tokenizer(f'{example["summary"]}', max_length=64, padding="max_length", truncation=True) + summaries_text = tokenizer.decode(summaries['input_ids'], skip_special_tokens=True) + + inputs = tokenizer(f'{input_text + summaries_text}', max_length=512 + 64, padding="max_length", truncation=True) + + inputs["labels"] = inputs["input_ids"] + + return inputs + + + def en_tokenize_function(example): + inputs = en_tokenizer(f'{example["text"]}', max_length=512, padding="max_length", truncation=True) + + with en_tokenizer.as_target_tokenizer(): + summaries = en_tokenizer(f'{example["summary"]}', max_length=512, padding="max_length", truncation=True) + + inputs["labels"] = summaries["input_ids"] + + return inputs + + +# tokenizing the dataset +logger.info("Tokenizing the dataset...") +if args.do_train: + if args.cross_lingual: + train_dataset = train_dataset.map(en_tokenize_function, batched=False) + val_dataset = val_dataset.map(en_tokenize_function, batched=False) + else: + train_dataset = train_dataset.map(tokenize_function, batched=False) + val_dataset = val_dataset.map(tokenize_function, batched=False) + + logger.info("Print tokenized dataset example ...") + logger.info(train_dataset[0]) + +test_dataset = test_dataset.map(tokenize_function, batched=False) + +# TODO: same as above, we probably need a better way than if-else statements. +# load metric +logger.info("Loading metric...") + +if args.dataset == XNLI: + metric = load_metric("xnli") + + def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +elif args.dataset == XLSUM: + metric = load_metric('rouge') + + def compute_metrics(eval_preds): + # TODO: note that this function calls trainer.model + preds, labels = eval_preds + + preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in preds] + labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels] + + result = metric.compute(predictions=preds, references=labels) + # TODO: need to confirm these are the right rouge values to report. Can report more ROUGE metrics if needed. 
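# A short sketch of the structure metric.compute returns for the ROUGE metric
# loaded above (datasets' rouge wrapper with its default aggregator): each key
# ('rouge1', 'rouge2', 'rougeL', 'rougeLsum') maps to an aggregate holding
# low/mid/high Score tuples, which is why the dict comprehension below reads
# value.mid.fmeasure. The prediction/reference strings are toy inputs.
from datasets import load_metric

toy_rouge = load_metric("rouge")
toy_result = toy_rouge.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
)
rouge_l_f1 = toy_result["rougeL"].mid.fmeasure * 100  # mid = central bootstrap estimate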
+ result = {key: value.mid.fmeasure * 100 for key, value in result.items()} + + return {k: round(v, 4) for k, v in result.items()} + + def compute_beam_search_metrics(model, dataset): + input_ids = torch.Tensor(dataset['input_ids']).type(torch.IntTensor) + model.cuda() + print(input_ids.shape) + print(model.device) + + beam_scorer = BeamSearchScorer( + batch_size=2, + num_beams=4, + device=model.device, + ) + + # instantiate logits processors + logits_processor = LogitsProcessorList( + [ + ForcedEOSTokenLogitsProcessor(512+64, eos_token_id=model.config.eos_token_id), + ] + ) + + preds = model.beam_search(input_ids[:2, :512].repeat_interleave(4, dim=0).cuda(), beam_scorer, logits_processor=logits_processor) + preds = tokenizer.batch_decode(preds) + print(preds) + assert False + labels = np.array(dataset['input_ids'])[:2, 512:] + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in preds] + labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels] + result = metric.compute(predictions=preds, references=labels) + print(result) + # print(preds) + # labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + # labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in preds] + # labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels] + + # result = metric.compute(predictions=preds, references=labels) + assert False + +else: + raise ValueError("Unknown dataset provided") + + +training_args = trainer_args_mapping[args.dataset]( + output_dir=args.output_dir, + overwrite_output_dir=True, + do_train=True, + do_eval=True, + eval_steps=500 if not args.use_partial_data else None, + num_train_epochs=args.num_train_epochs, + max_steps=args.max_steps, + per_device_train_batch_size=args.per_device_train_batch_size, + per_device_eval_batch_size=args.per_device_eval_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + logging_steps=500, + report_to="tensorboard", + logging_dir=f"{args.output_dir}/logs", + load_best_model_at_end=True, + deepspeed=args.deepspeed, + **optional_trainer_args, +) + +def load_model(args, inference=False): + # load model + pad_token_id = en_tokenizer.pad_token_id if (not inference and args.cross_lingual) else tokenizer.pad_token_id + model = model_class_mapping[args.dataset].from_pretrained(args.original_model, + pad_token_id=pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision, + **optional_model_kwargs) + + if args.no_task_adapter: + for name, param in model.named_parameters(): + param.requires_grad = False + if not inference and "word_embeddings" in name: + param.requires_grad = True + + # print out for debugging + if not param.requires_grad: + print(f"Frozen layer '{name}'") + else: + print(f"Trainable layer '{name}'") + return model + + + # obtain the embeddings for training (supervised setting) and inference + if "tr5b-1B3" in args.original_model: # previous 1.3B bigsience model + token_embedding = torch.load(f'{args.adapted_model}/embedding_wte.pt') + add_embedding = torch.load(f'{args.adapted_model}/embedding_wpe.pt') + elif "bloom" in args.original_model: + token_embedding = torch.load(f'{args.adapted_model}/word_embeddings.pt') + add_embedding = 
torch.load(f'{args.adapted_model}/word_embeddings_layernorm.pt') + + # load embedding layers + if not args.cross_lingual or inference: + print(model.transformer.wte.weight) + model.transformer.wte = wte + print(model.transformer.wte.weight) + assert False + # need to load embedding/adapters from the model adapted to the new language + causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + causal_lm_model.resize_token_embeddings(len(tokenizer)) + if not args.original_model == args.adapted_model: + causal_lm_model.transformer.wte = wte + causal_lm_model.transformer.wpe = wpe + if args.madx_lang_adapter: + adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + model.transformer = causal_lm_model.transformer + model.set_active_adapters(adapter_name) + + if not inference: + #if not args.cross_lingual: normally need to add adapter in any case + # normally this is already done, why use adapter_lang_name here? + #if args.madx_lang_adapter: + # adapter_name = model.load_adapter(args.madx_lang_adapter, + # config="pfeiffer+inv", + # load_as=args.adapter_lang_name) + model.add_adapter(f"{args.dataset}-task-adapter") + model.train_adapter(f"{args.dataset}-task-adapter") + + + print("🔥 ==================== Training: ==================== 🔥") + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + print(model) + else: + #if args.madx_lang_adapter: + assert args.pretrained_adapters_dir + # normally this is done in any case + #adapter_name = model.load_adapter(args.madx_lang_adapter) + #model.set_active_adapters(adapter_name) + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.dataset}-task-adapter") + model.set_active_adapters(adapter_name) + #else: + # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") + # # not sure what happens here + # # for TGT -> TGT supervised finetuning setting, change adapter_name + # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") + # model.set_active_adapters(adapter_name) + print(model) + return model + + +if args.do_train: + logger.info("Starting training...") + model = load_model(args) + + # only use seq2seq collator if doing seq2seq task + if args.dataset == XLSUM: + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=-100, + ) + + if args.no_task_adapter: + logger.info(f"Using {trainer_no_task_adpt_class_mapping[args.dataset]} for training") + trainer = trainer_no_task_adpt_class_mapping[args.dataset]( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == XLSUM else {}, + ) + else: + logger.info(f"Using {trainer_class_mapping[args.dataset]} for training") + trainer = trainer_class_mapping[args.dataset]( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == XLSUM else {}, + ) + + trainer.train() + + + +if args.do_predict: + evaluation_dirs = list(sorted([ + checkpoint_dir + for checkpoint_dir in os.listdir(args.output_dir) + if 
checkpoint_dir.startswith("checkpoint-")], + key=lambda x: int(x[len('checkpoint-'):]))) + assert len(evaluation_dirs) > 0 + logger.info(f"Found {len(evaluation_dirs)} checkpoints") + + # load the last checkpoint. + args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" + logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + + model = load_model(args, inference=True) + training_args.report_to = list() + + if args.dataset == XLSUM: + # use beam search to get the results following the XLSUM paper + compute_beam_search_metrics(model, test_dataset) + assert False + + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=-100, + pad_to_multiple_of=8 if training_args.fp16 else None, + ) + + if args.no_task_adapter: + logger.info(f"Using {trainer_no_task_adpt_class_mapping[args.dataset]} for training") + trainer = trainer_no_task_adpt_class_mapping[args.dataset]( + model=model, + args=training_args, + eval_dataset=test_dataset, + compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == XLSUM else {} + ) + else: + trainer = trainer_class_mapping[args.dataset]( + model=model, + args=training_args, + eval_dataset=test_dataset, + compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == XLSUM else {} + + ) + + print("Evaluating on test set...", trainer.evaluate()) + From 17aae4198549c6cb50be7acb4347d3ea47cf320d Mon Sep 17 00:00:00 2001 From: Vassilina Nikoulina Date: Thu, 16 Jun 2022 18:17:24 +0200 Subject: [PATCH 090/142] seems like embedding+adapter wasn't working as expected in the previous version --- scripts/lang_adapt/madx_run_clm.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 4acbc2e..9024b13 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -516,21 +516,38 @@ def get_adapter_config(adapter_args, model_args): ) print(f"✅ Use Embedding Strategy: {model_args.embedding_strategies}") + if model_args.embedding_strategies == "overlap-replace": if not tokenizer.name_or_path == model_args.model_name_or_path: orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - model.add_embeddings('lng_emb', tokenizer, reference_embedding='default', reference_tokenizer=orig_tokenizer ) - model._active_embedding = "lng_emb" - model.delete_embeddings('default') + + ref_embedding = model.transformer.wte + model.resize_token_embeddings(len(tokenizer)) + overlap = set(tokenizer.vocab).intersection(set(orig_tokenizer.vocab)) + curr_vocab = tokenizer.vocab + orig_vocab = orig_tokenizer.vocab + for t in overlap: + model.transformer.wte.weight.data[curr_vocab[t]] = ref_embedding.weight[orig_vocab[t]] model.tie_weights() + elif model_args.embedding_strategies == "replace": model.resize_token_embeddings(len(tokenizer)) + model.tie_weights() + #if model_args.embedding_strategies == "overlap-replace": + # if not tokenizer.name_or_path == model_args.model_name_or_path: + # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + # model.add_embeddings('lng_emb', tokenizer, reference_embedding='default', reference_tokenizer=orig_tokenizer ) + # model._active_embedding = "lng_emb" + # model.delete_embeddings('default') + # model.tie_weights() + #elif model_args.embedding_strategies == "replace": + # 
model.resize_token_embeddings(len(tokenizer)) trainable_params = 0 frozen_params = 0 emb_params = 0 for name, param in model.named_parameters(): - if "word_embeddings" in name: + if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name: param.requires_grad = True emb_params += param.numel() elif model_args.lang_adapt_strategies == "emb": @@ -543,13 +560,10 @@ def get_adapter_config(adapter_args, model_args): print(f"🚀 Trainable layer '{name}'") trainable_params += param.numel() - if "wte" and "wpe" in name: - emb_params += param.numel() print(f"Total frozen parameters: {frozen_params}") print(f"Total emb parameters (wte, wpe): {emb_params}") print(f"Total trainable parameters: {trainable_params}") - def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. @@ -640,7 +654,6 @@ def main(): print("Model: 👇") print(model) - # Training if training_args.do_train: checkpoint = None From 86b304604cba1f471129c0831402462b9588b844 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 21 Jun 2022 03:47:40 -0400 Subject: [PATCH 091/142] add baseline arguments to eval --- scripts/eval/eval.py | 205 +++++++++++++++++++++++-------------------- 1 file changed, 111 insertions(+), 94 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 09cfc64..358c26e 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -33,7 +33,8 @@ # parser parser = argparse.ArgumentParser() parser.add_argument("output_dir") -parser.add_argument("--lang", type=str, default="german") #xlsum requires a language name, not language code +parser.add_argument("--train_lang", type=str) +parser.add_argument("--lang", type=str) #xlsum requires a language name, not language code tasks = [XNLI, XLSUM] parser.add_argument("--dataset", choices=tasks, required=True) @@ -50,7 +51,6 @@ parser.add_argument("--original_model") parser.add_argument("--tokenizer") parser.add_argument("--do_train", default=False, action="store_true") -parser.add_argument("--do_eval_after_train", default=False, action="store_true") parser.add_argument("--do_predict", default=False, action="store_true") parser.add_argument("--use_partial_data", default=False, action="store_true") parser.add_argument("--use_partial_train_data", type=int, default=100) @@ -63,6 +63,7 @@ parser.add_argument("--madx_lang_adapter", default=None) parser.add_argument("--no_task_adapter", default=False, action="store_true") # parser.add_argument("--adapter_lang_name", required=True) +parser.add_argument("--baseline", default=False, action="store_true") parser.add_argument("--deepspeed", required=False) # mapping of tasks to model/trainer classes @@ -75,10 +76,15 @@ args = parser.parse_args() #### Process args -if args.do_eval_after_train: - args.do_predict = True +if not args.cross_lingual and not args.train_lang: + args.train_lang = args.lang -if not args.madx_lang_adapter and args.no_task_adapter: +if args.train_lang != args.lang: + assert args.cross_lingual +elif args.train_lang == args.lang: + assert not args.cross_lingual + +if args.baseline: logger.warning("❗️ No 'madx_lang_adapter' loaded. This should be the baseline performance.") # additional args to pass to the model init. 
task-dependent @@ -94,7 +100,6 @@ "lr_scheduler_type": "linear", "warmup_steps": 0} - if args.local_rank: torch.cuda.set_device(args.local_rank) @@ -105,22 +110,23 @@ print("Arguments: ========") print(args) - # load appropriate dataset logger.info("Loading dataset...") # will need to rename splits if the dataset has different name for validation set if args.cross_lingual: - logger.info("0️⃣ Cross Lingual setting") + logger.info(f"0️⃣ Cross Lingual setting") + logger.info(f"train lang: {args.train_lang}; inference lang: {args.lang}") # cross lingual: use english as train and validation set - en_dataset = load_dataset(args.dataset, "english" if args.dataset == XLSUM else "en", cache_dir=args.cache_dir) + en_dataset = load_dataset(args.dataset, args.train_lang, cache_dir=args.cache_dir) dataset = load_dataset(args.dataset, args.lang, cache_dir=args.cache_dir) train_dataset = en_dataset["train"] val_dataset = en_dataset["validation"] test_dataset = dataset["test"] else: - logger.info("👀 Supervised training setting") + logger.info(f"👀 Supervised training setting") + logger.info(f"language: {args.lang})") dataset = load_dataset(args.dataset, args.lang, cache_dir=args.cache_dir) train_dataset = dataset["train"] @@ -128,13 +134,19 @@ test_dataset = dataset["test"] if args.use_partial_data: - logger.warning("🚨 Loading partial data!") train_dataset = train_dataset.shuffle(seed=args.seed).select(range(args.use_partial_train_data)) if args.use_partial_val_data != -1: val_dataset = val_dataset.shuffle(seed=args.seed).select(range(args.use_partial_val_data)) if args.use_partial_test_data != -1: - test_dataset = val_dataset.shuffle(seed=args.seed).select(range(args.use_partial_test_data)) + test_dataset = test_dataset.shuffle(seed=args.seed).select(range(args.use_partial_test_data)) + logger.warning("🚨 Loading partial data!") +if args.do_train: + logger.info(f"train = {len(train_dataset)} samples") +else: + logger.info(f"args.do_train = False") +logger.info(f"val = {len(val_dataset)} samples") +logger.info(f"test = {len(test_dataset)} samples") # load tokenizer logger.info("Loading tokenizer...") @@ -193,7 +205,7 @@ def en_tokenize_function(example): train_dataset = train_dataset.map(tokenize_function, batched=False) val_dataset = val_dataset.map(tokenize_function, batched=False) - logger.info("Print tokenized dataset example ...") + logger.info("Print one tokenized dataset example ...") logger.info(train_dataset[0]) test_dataset = test_dataset.map(tokenize_function, batched=False) @@ -298,8 +310,52 @@ def compute_beam_search_metrics(model, dataset): **optional_trainer_args, ) +def print_model_trainable_layers(model): + for name, param in model.named_parameters(): + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + else: + print(f"🚀 Trainable layer '{name}'") + def load_model(args, inference=False): - # load model + def load_task_specific_adapters(args, model, inference=False): + if not inference: + model.add_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") + model.train_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") + return model + else: + print("yes") + adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.dataset.split('/')[-1]}-task-adapter") + model.set_active_adapters(adapter_name) + return model + + def load_embedding_layers(args, tokenizer, model): + # # use original causal LM model to load the embedding layers + # causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) + # 
causal_lm_model.resize_token_embeddings(len(tokenizer)) + # if not args.original_model == args.adapted_model: + # causal_lm_model.transformer.wte = wte + # causal_lm_model.transformer.wpe = wpe + + if "tr5b-1B3" in args.original_model: # previous 1.3B bigsience model + token_embedding = torch.load(f'{args.adapted_model}/embedding_wte.pt') + add_embedding = torch.load(f'{args.adapted_model}/embedding_wpe.pt') + model.transformer.wte = token_embedding + model.transformer.wpe = add_embedding + + elif "bloom" in args.original_model: + token_embedding = torch.load(f'{args.adapted_model}/word_embeddings.pt') + add_embedding = torch.load(f'{args.adapted_model}/word_embeddings_layernorm.pt') + model.transformer.word_embeddings = token_embedding + model.transformer.word_embeddings_layernorm = add_embedding + + return model + + def load_language_adapters(args, model): + adapter_name = model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") + model.set_active_adapters(adapter_name) + return model + pad_token_id = en_tokenizer.pad_token_id if (not inference and args.cross_lingual) else tokenizer.pad_token_id model = model_class_mapping[args.dataset].from_pretrained(args.original_model, pad_token_id=pad_token_id, @@ -307,28 +363,20 @@ def load_model(args, inference=False): revision=args.revision, **optional_model_kwargs) - if args.no_task_adapter: - for name, param in model.named_parameters(): - param.requires_grad = False - if not inference and "word_embeddings" in name: - param.requires_grad = True - - # print out for debugging - if not param.requires_grad: - print(f"Frozen layer '{name}'") - else: - print(f"Trainable layer '{name}'") + # baseline: only need to add task-specific adapters + if args.baseline: + model = load_task_specific_adapters(args, model, inference) return model - - # obtain the embeddings for training (supervised setting) and inference - if "tr5b-1B3" in args.original_model: # previous 1.3B bigsience model - token_embedding = torch.load(f'{args.adapted_model}/embedding_wte.pt') - add_embedding = torch.load(f'{args.adapted_model}/embedding_wpe.pt') - elif "bloom" in args.original_model: - token_embedding = torch.load(f'{args.adapted_model}/word_embeddings.pt') - add_embedding = torch.load(f'{args.adapted_model}/word_embeddings_layernorm.pt') - + # adapted models + if not args.cross_lingual or inference: + model = load_embedding_layers(args, tokenizer, model) + model = load_language_adapters(args, model) + + model = load_task_specific_adapters(args, model, inference) + return model + + # load embedding layers if not args.cross_lingual or inference: print(model.transformer.wte.weight) @@ -336,16 +384,13 @@ def load_model(args, inference=False): print(model.transformer.wte.weight) assert False # need to load embedding/adapters from the model adapted to the new language - causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) - causal_lm_model.resize_token_embeddings(len(tokenizer)) - if not args.original_model == args.adapted_model: - causal_lm_model.transformer.wte = wte - causal_lm_model.transformer.wpe = wpe + if args.madx_lang_adapter: adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") model.transformer = causal_lm_model.transformer model.set_active_adapters(adapter_name) + if not inference: #if not args.cross_lingual: normally need to add adapter in any case # normally this is already done, why use adapter_lang_name here? 
@@ -353,17 +398,9 @@ def load_model(args, inference=False): # adapter_name = model.load_adapter(args.madx_lang_adapter, # config="pfeiffer+inv", # load_as=args.adapter_lang_name) - model.add_adapter(f"{args.dataset}-task-adapter") - model.train_adapter(f"{args.dataset}-task-adapter") - + model.add_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") + model.train_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") - print("🔥 ==================== Training: ==================== 🔥") - for name, param in model.named_parameters(): - if not param.requires_grad: - print(f"🥶 Frozen layer '{name}'") - else: - print(f"🚀 Trainable layer '{name}'") - print(model) else: #if args.madx_lang_adapter: assert args.pretrained_adapters_dir @@ -385,6 +422,8 @@ def load_model(args, inference=False): if args.do_train: logger.info("Starting training...") model = load_model(args) + print("🔥 ==================== Training: ==================== 🔥") + print_model_trainable_layers(model) # only use seq2seq collator if doing seq2seq task if args.dataset == XLSUM: @@ -394,33 +433,20 @@ def load_model(args, inference=False): label_pad_token_id=-100, ) - if args.no_task_adapter: - logger.info(f"Using {trainer_no_task_adpt_class_mapping[args.dataset]} for training") - trainer = trainer_no_task_adpt_class_mapping[args.dataset]( - model=model, - args=training_args, - train_dataset=train_dataset, - eval_dataset=val_dataset, - compute_metrics=compute_metrics, - # args for xlsum only - **{"data_collator": data_collator} if args.dataset == XLSUM else {}, - ) - else: - logger.info(f"Using {trainer_class_mapping[args.dataset]} for training") - trainer = trainer_class_mapping[args.dataset]( - model=model, - args=training_args, - train_dataset=train_dataset, - eval_dataset=val_dataset, - compute_metrics=compute_metrics, - # args for xlsum only - **{"data_collator": data_collator} if args.dataset == XLSUM else {}, - ) + logger.info(f"Using {trainer_class_mapping[args.dataset]} for training") + trainer = trainer_class_mapping[args.dataset]( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == XLSUM else {}, + ) trainer.train() - if args.do_predict: evaluation_dirs = list(sorted([ checkpoint_dir @@ -428,13 +454,14 @@ def load_model(args, inference=False): if checkpoint_dir.startswith("checkpoint-")], key=lambda x: int(x[len('checkpoint-'):]))) assert len(evaluation_dirs) > 0 - logger.info(f"Found {len(evaluation_dirs)} checkpoints") + print(f"Found {len(evaluation_dirs)} checkpoints") # load the last checkpoint. 
args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - logger.info(f"[Evaluation] Loading trained model from {evaluation_dirs[-1]}") + print(f"[Evaluation] Loading trained model from {args.pretrained_adapters_dir}") model = load_model(args, inference=True) + model.eval() training_args.report_to = list() if args.dataset == XLSUM: @@ -449,26 +476,16 @@ def load_model(args, inference=False): pad_to_multiple_of=8 if training_args.fp16 else None, ) - if args.no_task_adapter: - logger.info(f"Using {trainer_no_task_adpt_class_mapping[args.dataset]} for training") - trainer = trainer_no_task_adpt_class_mapping[args.dataset]( - model=model, - args=training_args, - eval_dataset=test_dataset, - compute_metrics=compute_metrics, - # args for xlsum only - **{"data_collator": data_collator} if args.dataset == XLSUM else {} - ) - else: - trainer = trainer_class_mapping[args.dataset]( - model=model, - args=training_args, - eval_dataset=test_dataset, - compute_metrics=compute_metrics, - # args for xlsum only - **{"data_collator": data_collator} if args.dataset == XLSUM else {} + eval_trainer = trainer_class_mapping[args.dataset]( + model=model, + args=training_args, + eval_dataset=test_dataset, + compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == XLSUM else {} - ) + ) - print("Evaluating on test set...", trainer.evaluate()) + print("Evaluating on test set...") + print(eval_trainer.evaluate()) From 5c9501ba30fe418e4a1aa7fb7bf0c4c24f89a6ae Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 21 Jun 2022 07:16:58 -0400 Subject: [PATCH 092/142] xnli adappters --- scripts/eval/eval.py | 92 +++++++++++++------------------------------- 1 file changed, 26 insertions(+), 66 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 358c26e..b5bf265 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -10,7 +10,7 @@ import torch import numpy as np from transformers import TrainingArguments, Trainer, Seq2SeqTrainer, AdapterTrainer, Seq2SeqAdapterTrainer, Seq2SeqTrainingArguments -from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForCausalLM +from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForCausalLM, AutoModelForTokenClassification from transformers import DataCollatorForSeq2Seq from transformers import ( get_linear_schedule_with_warmup, @@ -47,7 +47,7 @@ parser.add_argument("--per_device_train_batch_size", type=int, default=4) parser.add_argument("--gradient_accumulation_steps", type=int, default=4) parser.add_argument("--per_device_eval_batch_size", type=int, default=1) -parser.add_argument("--adapted_model") +parser.add_argument("--adapted_model_dir") parser.add_argument("--original_model") parser.add_argument("--tokenizer") parser.add_argument("--do_train", default=False, action="store_true") @@ -67,25 +67,25 @@ parser.add_argument("--deepspeed", required=False) # mapping of tasks to model/trainer classes -model_class_mapping = {XNLI: AutoModelForSequenceClassification, XLSUM: AutoModelWithLMHead} +model_class_mapping = { + XNLI: AutoModelForSequenceClassification, + XLSUM: AutoModelWithLMHead +} trainer_no_task_adpt_class_mapping = {XNLI: Trainer, XLSUM: Seq2SeqTrainer} trainer_class_mapping = {XNLI: AdapterTrainer, XLSUM: Seq2SeqAdapterTrainer} trainer_args_mapping = {XNLI: TrainingArguments, XLSUM: Seq2SeqTrainingArguments} - args = parser.parse_args() #### Process args if not 
args.cross_lingual and not args.train_lang: args.train_lang = args.lang - -if args.train_lang != args.lang: - assert args.cross_lingual -elif args.train_lang == args.lang: - assert not args.cross_lingual +# ensure that only when cross_lingual, train_lang is not the same as lang +assert not ((args.train_lang != args.lang) ^ args.cross_lingual) if args.baseline: logger.warning("❗️ No 'madx_lang_adapter' loaded. This should be the baseline performance.") + assert not args.madx_lang_adapter # additional args to pass to the model init. task-dependent optional_model_kwargs = {} @@ -104,8 +104,8 @@ torch.cuda.set_device(args.local_rank) if args.original_model is None: - # here: because the wpe is not saved, adapted_model is the original bigsciece model - args.original_model = args.adapted_model + # here: because the wpe is not saved, adapted_model_dir is the original bigsciece model + args.original_model = args.adapted_model_dir print("Arguments: ========") print(args) @@ -324,31 +324,31 @@ def load_task_specific_adapters(args, model, inference=False): model.train_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") return model else: - print("yes") adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.dataset.split('/')[-1]}-task-adapter") model.set_active_adapters(adapter_name) return model def load_embedding_layers(args, tokenizer, model): + ###### legacy code # # use original causal LM model to load the embedding layers # causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) # causal_lm_model.resize_token_embeddings(len(tokenizer)) - # if not args.original_model == args.adapted_model: + # if not args.original_model == args.adapted_model_dir: # causal_lm_model.transformer.wte = wte # causal_lm_model.transformer.wpe = wpe if "tr5b-1B3" in args.original_model: # previous 1.3B bigsience model - token_embedding = torch.load(f'{args.adapted_model}/embedding_wte.pt') - add_embedding = torch.load(f'{args.adapted_model}/embedding_wpe.pt') + token_embedding = torch.load(f'{args.adapted_model_dir}/embedding_wte.pt') + add_embedding = torch.load(f'{args.adapted_model_dir}/embedding_wpe.pt') model.transformer.wte = token_embedding model.transformer.wpe = add_embedding elif "bloom" in args.original_model: - token_embedding = torch.load(f'{args.adapted_model}/word_embeddings.pt') - add_embedding = torch.load(f'{args.adapted_model}/word_embeddings_layernorm.pt') + token_embedding = torch.load(f'{args.adapted_model_dir}/word_embeddings.pt') + add_embedding = torch.load(f'{args.adapted_model_dir}/word_embeddings_layernorm.pt') model.transformer.word_embeddings = token_embedding model.transformer.word_embeddings_layernorm = add_embedding - + return model def load_language_adapters(args, model): @@ -358,12 +358,13 @@ def load_language_adapters(args, model): pad_token_id = en_tokenizer.pad_token_id if (not inference and args.cross_lingual) else tokenizer.pad_token_id model = model_class_mapping[args.dataset].from_pretrained(args.original_model, - pad_token_id=pad_token_id, - cache_dir=args.cache_dir, - revision=args.revision, - **optional_model_kwargs) + pad_token_id=pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision, + **optional_model_kwargs) - # baseline: only need to add task-specific adapters + # baseline: only need to add task-specific adapters + # (keeps separated for now for easier debugging) if args.baseline: model = load_task_specific_adapters(args, model, inference) return model @@ -371,52 +372,11 @@ def load_language_adapters(args, model): # 
adapted models if not args.cross_lingual or inference: model = load_embedding_layers(args, tokenizer, model) - model = load_language_adapters(args, model) + if args.madx_lang_adapter: + model = load_language_adapters(args, model) model = load_task_specific_adapters(args, model, inference) return model - - - # load embedding layers - if not args.cross_lingual or inference: - print(model.transformer.wte.weight) - model.transformer.wte = wte - print(model.transformer.wte.weight) - assert False - # need to load embedding/adapters from the model adapted to the new language - - if args.madx_lang_adapter: - adapter_name = causal_lm_model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") - model.transformer = causal_lm_model.transformer - model.set_active_adapters(adapter_name) - - - if not inference: - #if not args.cross_lingual: normally need to add adapter in any case - # normally this is already done, why use adapter_lang_name here? - #if args.madx_lang_adapter: - # adapter_name = model.load_adapter(args.madx_lang_adapter, - # config="pfeiffer+inv", - # load_as=args.adapter_lang_name) - model.add_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") - model.train_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") - - else: - #if args.madx_lang_adapter: - assert args.pretrained_adapters_dir - # normally this is done in any case - #adapter_name = model.load_adapter(args.madx_lang_adapter) - #model.set_active_adapters(adapter_name) - adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.dataset}-task-adapter") - model.set_active_adapters(adapter_name) - #else: - # # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/013/xnli_de_de_100K_adpt_16_0shot/checkpoint-24544/xnli-task-adapter") - # # not sure what happens here - # # for TGT -> TGT supervised finetuning setting, change adapter_name - # adapter_name = model.load_adapter("/users/zyong2/data/zyong2/bigscience/data/processed/exp-013/task_xnli_de_ft_100000_ori/checkpoint-24544/xnli-task-adapter") - # model.set_active_adapters(adapter_name) - print(model) - return model if args.do_train: From 8a8b9caba7c021eb2f6d21d35ac097fef253d823 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 21 Jun 2022 07:25:02 -0400 Subject: [PATCH 093/142] remove legacy code adapter_lang_name --- scripts/eval/eval.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index b5bf265..1853b02 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -61,8 +61,6 @@ parser.add_argument("--local_rank", type=int) parser.add_argument("--madx_lang_adapter", default=None) -parser.add_argument("--no_task_adapter", default=False, action="store_true") -# parser.add_argument("--adapter_lang_name", required=True) parser.add_argument("--baseline", default=False, action="store_true") parser.add_argument("--deepspeed", required=False) From 7eca3995ecda813d79000c6b27faca38e870314a Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 22 Jun 2022 06:43:37 -0400 Subject: [PATCH 094/142] add wikiann --- scripts/eval/eval.py | 58 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 1853b02..2e85844 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -29,6 +29,7 @@ # AVAILABLE TASKS XNLI = "xnli" XLSUM = "csebuetnlp/xlsum" +WIKIANN = "wikiann" # parser parser = argparse.ArgumentParser() @@ -36,7 +37,7 @@ parser.add_argument("--train_lang", type=str) 
parser.add_argument("--lang", type=str) #xlsum requires a language name, not language code -tasks = [XNLI, XLSUM] +tasks = [XNLI, XLSUM, WIKIANN] parser.add_argument("--dataset", choices=tasks, required=True) parser.add_argument("--cache_dir") @@ -67,11 +68,12 @@ # mapping of tasks to model/trainer classes model_class_mapping = { XNLI: AutoModelForSequenceClassification, - XLSUM: AutoModelWithLMHead + XLSUM: AutoModelWithLMHead, + WIKIANN: AutoModelForTokenClassification } -trainer_no_task_adpt_class_mapping = {XNLI: Trainer, XLSUM: Seq2SeqTrainer} -trainer_class_mapping = {XNLI: AdapterTrainer, XLSUM: Seq2SeqAdapterTrainer} -trainer_args_mapping = {XNLI: TrainingArguments, XLSUM: Seq2SeqTrainingArguments} +trainer_no_task_adpt_class_mapping = {XNLI: Trainer, XLSUM: Seq2SeqTrainer, WIKIANN: Trainer} +trainer_class_mapping = {XNLI: AdapterTrainer, XLSUM: Seq2SeqAdapterTrainer, WIKIANN: AdapterTrainer} +trainer_args_mapping = {XNLI: TrainingArguments, XLSUM: Seq2SeqTrainingArguments, WIKIANN: TrainingArguments} args = parser.parse_args() @@ -91,6 +93,8 @@ optional_eval_args = {} if args.dataset == XNLI: optional_model_kwargs = {"num_labels": 3} +elif args.dataset == WIKIANN: + optional_model_kwargs = {"num_labels": 7} elif args.dataset == XLSUM: optional_trainer_args = {"generation_max_length": 512 + 64, "predict_with_generate":True, @@ -192,6 +196,28 @@ def en_tokenize_function(example): return inputs +elif args.dataset == WIKIANN: + def tokenize_function(examples): + tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, max_length=128, padding="max_length", truncation=True) + + word_ids = tokenized_inputs.word_ids() # Map tokens to their respective word. + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: # Set the special tokens to -100. + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: # Only label the first token of a given word. 
+ label_ids.append(examples[f"ner_tags"][word_idx]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + + tokenized_inputs["labels"] = label_ids + return tokenized_inputs + + def en_tokenize_function(examples): + return en_tokenizer(examples['tokens'], is_split_into_words=True, max_length=128, padding="max_length", truncation=True) + # tokenizing the dataset logger.info("Tokenizing the dataset...") @@ -220,6 +246,28 @@ def compute_metrics(eval_pred): predictions = np.argmax(logits, axis=-1) return metric.compute(predictions=predictions, references=labels) +elif args.dataset == WIKIANN: + metric = load_metric("seqeval") + idx2labelname = {i: label for i, label in enumerate(dataset["train"].features[f"ner_tags"].feature.names)} + + def compute_metrics(eval_pred): + logits, golds = eval_pred + predictions = np.argmax(logits, axis=-1) + + converted_golds = list() + converted_preds = list() + + for i in range(golds.shape[0]): + gold, pred = list(), list() + for j in range(golds.shape[1]): + if golds[i][j] != -100: + gold.append(idx2labelname[golds[i][j]]) + pred.append(idx2labelname[predictions[i][j]]) + converted_golds.append(gold) + converted_preds.append(pred) + + return metric.compute(predictions=converted_preds, references=converted_golds) + elif args.dataset == XLSUM: metric = load_metric('rouge') From 088eb5a852d98d01f8e3dbbc53c6d360f56bd34d Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 22 Jun 2022 08:31:48 -0400 Subject: [PATCH 095/142] scripts for wikiann --- scripts/eval/wikiann/adpt_wikiann_de.sh | 67 +++++++++++++++++++++ scripts/eval/wikiann/baseline_wikiann_de.sh | 63 +++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 scripts/eval/wikiann/adpt_wikiann_de.sh create mode 100644 scripts/eval/wikiann/baseline_wikiann_de.sh diff --git a/scripts/eval/wikiann/adpt_wikiann_de.sh b/scripts/eval/wikiann/adpt_wikiann_de.sh new file mode 100644 index 0000000..10772dc --- /dev/null +++ b/scripts/eval/wikiann/adpt_wikiann_de.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 +#SBATCH --array=100 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-021-wikiann-adpt_wikiann_de + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/adpt_wikiann_de.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/adpt_wikiann_de.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +module load gitlfs/2.7.1 +source $FP_BIGS/env_try_lang_adapter/bin/activate + + +OUTPUT_DIR="/users/zyong2/data/zyong2/bigscience/data/processed/021-wikiann/bloom-1b3-adpt-de" # where you want to save checkpoints at +LANG="de" +CACHE_DIR="/users/zyong2/data/zyong2/huggingface" # cache dir for saving/loading HF models and wikiann datasets. 
+ +LR=1e-4 + +BIGS_MODEL="bigscience/bloom-1b3" +ADAPTER_MODEL_DIR="/users/zyong2/data/zyong2/bigscience/data/processed/020/bloom-1b3_de_emb-and-adpt_1000samples" +TOKENIZER_NAME="bigscience/bloom-1b3" +MADX="/users/zyong2/data/zyong2/bigscience/data/processed/020/bloom-1b3_de_emb-and-adpt_1000samples/oscar_pfeiffer+inv_de" + +# task-specific arguments +TASK_DATASET="wikiann" + +mkdir -p $OUTPUT_DIR + +python /users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/scripts/eval/eval.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--dataset $TASK_DATASET \ +--num_train_epochs 2 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 1 \ +--original_model $BIGS_MODEL \ +--adapted_model_dir $ADAPTER_MODEL_DIR \ +--madx_lang_adapter $MADX \ +--tokenizer $TOKENIZER_NAME \ +--do_predict \ +--use_partial_data \ +--use_partial_train_data 100 \ +--use_partial_val_data 100 \ +--use_partial_test_data 100 + diff --git a/scripts/eval/wikiann/baseline_wikiann_de.sh b/scripts/eval/wikiann/baseline_wikiann_de.sh new file mode 100644 index 0000000..04b6a1c --- /dev/null +++ b/scripts/eval/wikiann/baseline_wikiann_de.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 +#SBATCH --array=100 + +# Default resources are 1 core with 2.8GB of memory. +#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-021-wikiann-baseline_wikiann_de + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/baseline_wikiann_de.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/baseline_wikiann_de.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +module load gitlfs/2.7.1 +source $FP_BIGS/env_try_lang_adapter/bin/activate + + +OUTPUT_DIR="/users/zyong2/data/zyong2/bigscience/data/processed/021-wikiann/bloom-1b3-baseline-de" # where you want to save checkpoints at +LANG="de" +CACHE_DIR="/users/zyong2/data/zyong2/huggingface" # cache dir for saving/loading HF models and wikiann datasets. 
+ +LR=1e-4 + +BIGS_MODEL="bigscience/bloom-1b3" +MODEL_NAME="bigscience/bloom-1b3" +TOKENIZER_NAME="bigscience/bloom-1b3" + +# task-specific arguments +TASK_DATASET="wikiann" + +mkdir -p $OUTPUT_DIR + +python /users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/scripts/eval/eval.py \ +$OUTPUT_DIR \ +--lang $LANG \ +--cache_dir $CACHE_DIR \ +--dataset $TASK_DATASET \ +--num_train_epochs 100 \ +--learning_rate $LR \ +--per_device_train_batch_size 8 \ +--gradient_accumulation_steps 1 \ +--original_model $BIGS_MODEL \ +--adapted_model_dir $MODEL_NAME \ +--tokenizer $TOKENIZER_NAME \ +--do_train \ +--do_predict \ +--baseline + From 790c561fbf3aa7aa97854b5cfa22d06d78708871 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 22 Jun 2022 08:37:02 -0400 Subject: [PATCH 096/142] add do_train --- scripts/eval/wikiann/adpt_wikiann_de.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/eval/wikiann/adpt_wikiann_de.sh b/scripts/eval/wikiann/adpt_wikiann_de.sh index 10772dc..74ddfd8 100644 --- a/scripts/eval/wikiann/adpt_wikiann_de.sh +++ b/scripts/eval/wikiann/adpt_wikiann_de.sh @@ -59,9 +59,5 @@ $OUTPUT_DIR \ --adapted_model_dir $ADAPTER_MODEL_DIR \ --madx_lang_adapter $MADX \ --tokenizer $TOKENIZER_NAME \ ---do_predict \ ---use_partial_data \ ---use_partial_train_data 100 \ ---use_partial_val_data 100 \ ---use_partial_test_data 100 - +--do_train \ +--do_predict From 1f7119bc198c49d80c749674a7aedb39f0a9cb6a Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 23 Jun 2022 04:21:47 -0400 Subject: [PATCH 097/142] add comment about tokenizer training --- scripts/lang_adapt/tokenized4clm_sampled.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index a1c4c87..8569dd0 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -31,7 +31,6 @@ parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) parser.add_argument('--vocab_size', default=130_000, type=int) parser.add_argument('--extend_vocab', action='store_true') -# parser.add_argument('--replace_with_overlap', action='store_true') parser.add_argument('--sample_size', default=None, type=int) parser.add_argument("--use_auth_token", default=False, action="store_true") @@ -68,26 +67,17 @@ def batch_iterator(): model_name = pathlib.Path(args.model).parts[-1] if args.extend_vocab: - # FIXME: needs to work on loading the original tokenizer. - tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/') + # Yong: have checked that added tokens would have indices after the original vocab size. 
+ tokenizer = AutoTokenizer.from_pretrained(args.model) assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) print("✅ Trained tokenizer with len ", len(new_tokenizer)) added = tokenizer.add_tokens([tok for tok in new_tokenizer.vocab.keys()]) + print([tok for tok in new_tokenizer.vocab.keys()]) print(f"Overlap with previous vocab: {args.vocab_size - added}") tokenizer.save_pretrained(f"{args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_extend") print(f"Saved tokenizer to {args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_extend") -# elif args.replace_with_overlap: -# # -# tokenizer = AutoTokenizer.from_pretrained('/tmp-network/user/vnikouli/Projects/bigscience/multilingual-modeling/scripts/exp-009/tr5b-1B3-multilingual-alpha-checkpoints/', unk_token="") - -# assert tokenizer.is_fast -# new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) -# print("✅ Trained tokenizer with len ", len(new_tokenizer)) -# new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") -# print(f"Saved tokenizer to {args.tokenizer_dir}/{lang}_oscar_{args.sample_size}_tokenizer_{args.vocab_size}_overlap") - else: tokenizer = AutoTokenizer.from_pretrained(args.model, use_auth_token=args.use_auth_token) assert tokenizer.is_fast From 2be6fb53e5bf7c9d6ea79fc3e126d50e2c0b251f Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 23 Jun 2022 06:58:28 -0400 Subject: [PATCH 098/142] use register hook to update subelements --- scripts/lang_adapt/madx_run_clm.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 9024b13..567acb2 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -533,6 +533,18 @@ def get_adapter_config(adapter_args, model_args): elif model_args.embedding_strategies == "replace": model.resize_token_embeddings(len(tokenizer)) model.tie_weights() + + elif model_args.embedding_strategies == "extend": + original_embedding_layer = model.get_input_embeddings() + original_vocab_size = original_embedding_layer.weight.shape[0] + model.resize_token_embeddings(len(tokenizer)) + + embedding_layer = model.get_input_embeddings() + gradient_mask = torch.zeros(*embedding_layer.weight.shape) + gradient_mask[original_vocab_size:, :] = 1.0 # only finetune extended vocab + gradient_mask = gradient_mask.to(model.device) + embedding_layer.weight.register_hook(lambda grad: grad.mul_(gradient_mask)) + #if model_args.embedding_strategies == "overlap-replace": # if not tokenizer.name_or_path == model_args.model_name_or_path: # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) @@ -654,6 +666,8 @@ def main(): print("Model: 👇") print(model) + + # print(model.get_input_embeddings().weight) # get original weight for embedding layer # Training if training_args.do_train: checkpoint = None @@ -688,6 +702,9 @@ def main(): trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() + + # print(model.get_input_embeddings().weight) # get updated weight + # Evaluation if training_args.do_eval: From 5412d2ef5de17353380c7ec1e653754293675814 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 23 Jun 2022 11:53:53 -0400 Subject: [PATCH 099/142] don't use mask to zero out grad --- 
scripts/lang_adapt/madx_run_clm.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 567acb2..bf98093 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -538,12 +538,15 @@ def get_adapter_config(adapter_args, model_args): original_embedding_layer = model.get_input_embeddings() original_vocab_size = original_embedding_layer.weight.shape[0] model.resize_token_embeddings(len(tokenizer)) - + model.tie_weights() + embedding_layer = model.get_input_embeddings() - gradient_mask = torch.zeros(*embedding_layer.weight.shape) - gradient_mask[original_vocab_size:, :] = 1.0 # only finetune extended vocab - gradient_mask = gradient_mask.to(model.device) - embedding_layer.weight.register_hook(lambda grad: grad.mul_(gradient_mask)) + # erases gradients for the original embedding layer, without using extra CUDA memory + def zero_grad(grad): + grad[:original_vocab_size, :] = 0 + return grad + + embedding_layer.weight.register_hook(lambda grad: zero_grad(grad)) #if model_args.embedding_strategies == "overlap-replace": # if not tokenizer.name_or_path == model_args.model_name_or_path: @@ -667,7 +670,9 @@ def main(): print("Model: 👇") print(model) - # print(model.get_input_embeddings().weight) # get original weight for embedding layer + + # print("Embeddings at start of run:", model.get_input_embeddings().weight[250880:,:]) # get original weight for embedding layer + # orig_embeddings = model.get_input_embeddings().weight.detach().clone() # clone original weight for embedding layer # Training if training_args.do_train: checkpoint = None @@ -703,7 +708,16 @@ def main(): trainer.save_metrics("train", metrics) trainer.save_state() - # print(model.get_input_embeddings().weight) # get updated weight + # uncomment to test whether extending vocab gradient masking is working correctly. 
+ # if model_args.embedding_strategies == "extend": + # print("Unsliced, post-training:", model.get_input_embeddings().weight) # get updated weight + # if not torch.equal(orig_embeddings[:250880, :], model.get_input_embeddings().weight[:250880, :]): + # raise ValueError("embedding layer is updated where it shouldn't....") + + # if torch.equal(orig_embeddings[250880:, :], model.get_input_embeddings().weight[250880:, :]): + # print("original embeddings:", orig_embeddings[250880:, :]) + # print("updated embeddings:", model.get_input_embeddings().weight[250880:, :]) + # raise ValueError("embedding layer is not updated where it should....") # Evaluation From 1bbc8d50b544b1c39aec73265134aa3c448937a4 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 25 Jun 2022 16:05:31 +0700 Subject: [PATCH 100/142] added argument for enabling finetuning mode --- scripts/lang_adapt/madx_run_clm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 9024b13..4a8a0e5 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -111,6 +111,10 @@ class ModelArguments: default="", metadata={"help": "choose one of the two strategies - 'replace', 'extend', 'overlap-replace'"}, ) + finetuning_strategies: str = field( + default="full", + metadata={"help": "choose one of the three strategies - 'full', 'bitfit'"}, + ) adapter_placement: str = field( default="all", metadata={"help": "list of layers where to place the adapters: all: use all layers, '17,24': list layers id separated by ','"}, From 7042e628036069b3d939e22283b6e97b98a30257 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 25 Jun 2022 16:11:56 +0700 Subject: [PATCH 101/142] added logic to freeze all param except bias if bitfit is set to true --- scripts/lang_adapt/madx_run_clm.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 4a8a0e5..ee919af 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -547,6 +547,16 @@ def get_adapter_config(adapter_args, model_args): #elif model_args.embedding_strategies == "replace": # model.resize_token_embeddings(len(tokenizer)) + print(f"✅ Use Finetuning Strategy: {model_args.finetuning_strategies}") + + if model_args.finetuning_strategies == "full": + # No modification needed + pass + elif model_args.finetuning_strategies == "bitfit": + for name, param in model.base_model.named_parameters(): + if 'bias' not in name: + param.requires_grad = False + trainable_params = 0 frozen_params = 0 emb_params = 0 @@ -568,6 +578,7 @@ def get_adapter_config(adapter_args, model_args): print(f"Total frozen parameters: {frozen_params}") print(f"Total emb parameters (wte, wpe): {emb_params}") print(f"Total trainable parameters: {trainable_params}") + def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. 
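The BitFit logic added in PATCH 100/101 reduces to one rule: inside the transformer body, freeze everything except the bias terms, while the separate parameter loop from PATCH 090 keeps the embedding-related parameters (word_embeddings, wte, wpe, lm_head) trainable. A minimal, self-contained sketch of that rule, using gpt2 as a stand-in checkpoint (the scripts above run the same loop on bigscience/bloom-1b3):

from transformers import AutoModelForCausalLM

# Stand-in checkpoint; the patches apply the same loop to bigscience/bloom-1b3.
model = AutoModelForCausalLM.from_pretrained("gpt2")

# BitFit: only bias terms inside the transformer body stay trainable.
for name, param in model.transformer.named_parameters():
    param.requires_grad = "bias" in name

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable parameters: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

Because the embedding loop in madx_run_clm.py runs after this block, the effective trainable set under --finetuning_strategies bitfit is the biases plus the embedding parameters, which is what the "Total trainable parameters" printout above reflects.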
From 2ab63ff2914771dd08bd2129882e2076109a47fa Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 25 Jun 2022 16:51:24 +0700 Subject: [PATCH 102/142] switch order --- scripts/lang_adapt/madx_run_clm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index ee919af..33747af 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -549,13 +549,13 @@ def get_adapter_config(adapter_args, model_args): print(f"✅ Use Finetuning Strategy: {model_args.finetuning_strategies}") - if model_args.finetuning_strategies == "full": - # No modification needed - pass - elif model_args.finetuning_strategies == "bitfit": + if model_args.finetuning_strategies == "bitfit": for name, param in model.base_model.named_parameters(): if 'bias' not in name: param.requires_grad = False + elif model_args.finetuning_strategies == "full": + # No modification needed + pass trainable_params = 0 frozen_params = 0 From a7f7df57e8152c43a731e27a3f2ae6f7984c610c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 26 Jun 2022 11:36:59 +0700 Subject: [PATCH 103/142] added script --- scripts/lang_adapt/run_clm_bitfit_my.sh | 81 +++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 scripts/lang_adapt/run_clm_bitfit_my.sh diff --git a/scripts/lang_adapt/run_clm_bitfit_my.sh b/scripts/lang_adapt/run_clm_bitfit_my.sh new file mode 100644 index 0000000..1e0dec6 --- /dev/null +++ b/scripts/lang_adapt/run_clm_bitfit_my.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 +#SBATCH --array=1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=4 + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + +# Specify a job name: +#SBATCH -J exp-020-run_clm_emb + +# Specify an output file +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-020/run_clm_emb_%a.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-020/run_clm_emb_%a.err + +# Set up the environment by loading modules +set -a # automatically export all variables +source ~/.env +set +a + +module load python/3.7.4 +module load gitlfs/2.7.1 +source $FP_BIGS/env_try_lang_adapter/bin/activate + + +# axis +LANG="my" +DATA_SAMPLES=$(($SLURM_ARRAY_TASK_ID * 1000)) +VOCAB_SIZE=5000 +CH=118500 +BIGS_MODEL="bigscience/bloom-1b3" +FTNE_STRATEGY="bitfit" +EMBD_SRATEGY="overlap-replace" + +tokenizer_dir="bigscience/bloom-1b3" #"/users/zyong2/data/zyong2/bigscience/data/processed/020/tok_${BIGS_MODEL##*/}_${LANG}_oscar_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY}" +cache_dir="checkpoint/cache/" +output_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${FTNE_STRATEGY}_${EMBD_SRATEGY}_${DATA_SAMPLES}samples" +logging_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${FTNE_STRATEGY}_${EMBD_SRATEGY}_${DATA_SAMPLES}samples" + +mkdir -p $output_dir +mkdir -p $logging_dir + +python scripts/lang_adapt/madx_run_clm.py \ + --seed 0 \ + --fp16 \ + --model_name_or_path $BIGS_MODEL \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name "unshuffled_deduplicated_${LANG}" \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps 1000 \ + --evaluation_strategy "epoch" \ + --max_eval_samples 5000 \ + --save_steps 10000 \ + --save_strategy "epoch" \ + --save_total_limit 3 \ + --max_train_samples ${data_sample}\ + --max_steps 50000 \ + --load_best_model_at_end \ + --finetuning_strategies $FTNE_STRATEGY \ + --embedding_strategies $EMBD_SRATEGY \ + --language $LANG &> $output_dir/train.log From 729bd67cea5778970a09bac0380487038de5a9cd Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 26 Jun 2022 13:10:04 +0700 Subject: [PATCH 104/142] minor changes in script --- scripts/lang_adapt/run_clm_bitfit_my.sh | 42 +++++-------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/scripts/lang_adapt/run_clm_bitfit_my.sh b/scripts/lang_adapt/run_clm_bitfit_my.sh index 1e0dec6..51b0cbe 100644 --- a/scripts/lang_adapt/run_clm_bitfit_my.sh +++ b/scripts/lang_adapt/run_clm_bitfit_my.sh @@ -1,48 +1,19 @@ #!/bin/bash -# Request half an hour of runtime: -#SBATCH --time=2-23:59:00 - -# Ask for the GPU partition and 1 GPU -#SBATCH --partition=gpu-he --gres=gpu:1 -#SBATCH --array=1 - -# Default resources are 1 core with 2.8GB of memory. 
-#SBATCH --ntasks=4 - -# Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g - -# Specify a job name: -#SBATCH -J exp-020-run_clm_emb - -# Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-020/run_clm_emb_%a.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-020/run_clm_emb_%a.err - -# Set up the environment by loading modules -set -a # automatically export all variables -source ~/.env -set +a - -module load python/3.7.4 -module load gitlfs/2.7.1 -source $FP_BIGS/env_try_lang_adapter/bin/activate - - # axis LANG="my" DATA_SAMPLES=$(($SLURM_ARRAY_TASK_ID * 1000)) VOCAB_SIZE=5000 CH=118500 BIGS_MODEL="bigscience/bloom-1b3" -FTNE_STRATEGY="bitfit" +ADPT_STRATEGY="emb-and-adpt" EMBD_SRATEGY="overlap-replace" +FTNE_STRATEGY="bitfit" tokenizer_dir="bigscience/bloom-1b3" #"/users/zyong2/data/zyong2/bigscience/data/processed/020/tok_${BIGS_MODEL##*/}_${LANG}_oscar_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY}" cache_dir="checkpoint/cache/" -output_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${FTNE_STRATEGY}_${EMBD_SRATEGY}_${DATA_SAMPLES}samples" -logging_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${FTNE_STRATEGY}_${EMBD_SRATEGY}_${DATA_SAMPLES}samples" +output_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${EMBD_SRATEGY}_${FTNE_STRATEGY}_${DATA_SAMPLES}samples" +logging_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${EMBD_SRATEGY}_${FTNE_STRATEGY}_${DATA_SAMPLES}samples" mkdir -p $output_dir mkdir -p $logging_dir @@ -73,9 +44,10 @@ python scripts/lang_adapt/madx_run_clm.py \ --save_steps 10000 \ --save_strategy "epoch" \ --save_total_limit 3 \ - --max_train_samples ${data_sample}\ + --max_train_samples ${DATA_SAMPLES}\ --max_steps 50000 \ --load_best_model_at_end \ - --finetuning_strategies $FTNE_STRATEGY \ + --lang_adapt_strategies $ADPT_STRATEGY \ --embedding_strategies $EMBD_SRATEGY \ + --finetuning_strategies $FTNE_STRATEGY \ --language $LANG &> $output_dir/train.log From a63ccb6f368772da2c1554202a606303dd732d46 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 26 Jun 2022 13:11:16 +0700 Subject: [PATCH 105/142] hardcoded DATA_SAMPLES --- scripts/lang_adapt/run_clm_bitfit_my.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/run_clm_bitfit_my.sh b/scripts/lang_adapt/run_clm_bitfit_my.sh index 51b0cbe..e93c490 100644 --- a/scripts/lang_adapt/run_clm_bitfit_my.sh +++ b/scripts/lang_adapt/run_clm_bitfit_my.sh @@ -2,7 +2,7 @@ # axis LANG="my" -DATA_SAMPLES=$(($SLURM_ARRAY_TASK_ID * 1000)) +DATA_SAMPLES=100000 #$(($SLURM_ARRAY_TASK_ID * 1000)) VOCAB_SIZE=5000 CH=118500 BIGS_MODEL="bigscience/bloom-1b3" From 2fe1d408e3ef8f073f310ae87019a32d68821475 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 26 Jun 2022 19:52:09 +0700 Subject: [PATCH 106/142] changed lline from freezing base_model to transformer --- scripts/lang_adapt/madx_run_clm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 33747af..8db44b6 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -550,7 +550,7 @@ def get_adapter_config(adapter_args, model_args): print(f"✅ Use Finetuning Strategy: {model_args.finetuning_strategies}") if model_args.finetuning_strategies == "bitfit": - for name, param in model.base_model.named_parameters(): + for name, param in model.transformer.named_parameters(): if 'bias' not in name: param.requires_grad = False elif 
model_args.finetuning_strategies == "full": From 41f0f024d1021a5c6877e0d19f2b28d4b0acbde2 Mon Sep 17 00:00:00 2001 From: yongzx Date: Sun, 26 Jun 2022 11:06:22 -0400 Subject: [PATCH 107/142] overlap-replace support for BLOOM --- scripts/lang_adapt/madx_run_clm.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 9024b13..356bbe1 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -520,18 +520,36 @@ def get_adapter_config(adapter_args, model_args): if model_args.embedding_strategies == "overlap-replace": if not tokenizer.name_or_path == model_args.model_name_or_path: orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + else: + raise Exception("Same tokenizer so overlap-replace doesn't make sense.") + + if hasattr(model.transformer, "wte"): + # gpt-2 + ref_embedding = model.transformer.wte + elif hasattr(model.transformer, "word_embeddings"): + # bloom + ref_embedding = model.transformer.word_embeddings + else: + raise Exception("Unsupported Model") - ref_embedding = model.transformer.wte model.resize_token_embeddings(len(tokenizer)) overlap = set(tokenizer.vocab).intersection(set(orig_tokenizer.vocab)) + print(len(tokenizer)) + print(f"{len(overlap)} tokens overlapped") curr_vocab = tokenizer.vocab orig_vocab = orig_tokenizer.vocab for t in overlap: - model.transformer.wte.weight.data[curr_vocab[t]] = ref_embedding.weight[orig_vocab[t]] + if hasattr(model.transformer, "wte"): + model.transformer.wte.weight.data[curr_vocab[t]] = ref_embedding.weight[orig_vocab[t]] + elif hasattr(model.transformer, "word_embeddings"): + model.transformer.word_embeddings.weight.data[curr_vocab[t]] = ref_embedding.weight[orig_vocab[t]] + else: + raise Exception("Unsupported Model") model.tie_weights() elif model_args.embedding_strategies == "replace": model.resize_token_embeddings(len(tokenizer)) + print(len(tokenizer)) model.tie_weights() #if model_args.embedding_strategies == "overlap-replace": # if not tokenizer.name_or_path == model_args.model_name_or_path: From da5bdd36d9c7b7e4983432709589aa7861c9c081 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 27 Jun 2022 11:48:06 +0700 Subject: [PATCH 108/142] added script to calculate bias changes, wip --- scripts/lang_adapt/calculate_bias_changes.py | 49 ++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 scripts/lang_adapt/calculate_bias_changes.py diff --git a/scripts/lang_adapt/calculate_bias_changes.py b/scripts/lang_adapt/calculate_bias_changes.py new file mode 100644 index 0000000..fdb72c0 --- /dev/null +++ b/scripts/lang_adapt/calculate_bias_changes.py @@ -0,0 +1,49 @@ +from transformers import AutoModel + +model_0 = AutoModel.from_pretrained() +model_F = AutoModel.from_pretrained() + + +for (name_0, param_0), (name_F, param_F) in zip(model_0.named_parameters(), model_F.named_parameters()): + + param_name = + if "bias" in name_0: + if "query_key_value": + + # Query, Key, Value are merged in one MLP, + # so we need to seperate the bias terms + + head_size = model_0.config.hidden_size // model_0.config.num_attention_heads + + _q_change = None + _k_change = None + _v_change = None + for qkv_bias in [param_0, param_F]: + qkv_bias = qkv_bias.view(num_attention_heads, 3*head_size) + + if _q_change is None: + _q_change = qkv_bias[..., :head_size] + else: + _q_change -= qkv_bias[..., :head_size] + _q_change = torch.norm(_q_change) + + if _k_change is 
None: + _k_change = qkv_bias[..., head_size: 2 * head_size] + else: + _k_change -= qkv_bias[..., head_size: 2 * head_size] + _k_change = torch.norm(_k_change) + + if _v_change is None: + _v_change = qkv_bias[..., 2 * head_size:] + else: + _v_change -= qkv_bias[..., 2 * head_size:] + _v_change = torch.norm(_v_change) + else: + bias_change = torch.norm(param_0 - param_F) + +transformer.h.0.input_layernorm.bias +transformer.h.0.self_attention.query_key_value.bias +transformer.h.0.self_attention.dense.bias +transformer.h.0.post_attention_layernorm.bias +transformer.h.0.mlp.dense_h_to_4h.bias +transformer.h.0.mlp.dense_4h_to_h.bias \ No newline at end of file From 84d0880f818c977a846201ff393a0099b623469d Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 28 Jun 2022 01:36:18 -0400 Subject: [PATCH 109/142] support for WikiANN --- scripts/eval/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 2e85844..0c3128f 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -152,11 +152,11 @@ # load tokenizer logger.info("Loading tokenizer...") -tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision) +tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision, add_prefix_space=args.dataset in [WIKIANN]) tokenizer.pad_token = tokenizer.eos_token # TODO: we probably need better code for this than multiple if-else statements -en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision) +en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision, add_prefix_space=args.dataset in [WIKIANN]) en_tokenizer.pad_token = en_tokenizer.eos_token if args.dataset == XNLI: From de9ae89959b49ef8cea37941b6d7f13b56f532b6 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 28 Jun 2022 01:37:09 -0400 Subject: [PATCH 110/142] replace seed magic number with args.seed --- scripts/lang_adapt/tokenized4clm_sampled.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index 8569dd0..a221e87 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -33,6 +33,7 @@ parser.add_argument('--extend_vocab', action='store_true') parser.add_argument('--sample_size', default=None, type=int) parser.add_argument("--use_auth_token", default=False, action="store_true") +parser.add_argument("--seed", default=42, type=int) args = parser.parse_args() lang = args.lang @@ -44,7 +45,7 @@ "oscar", f"unshuffled_deduplicated_{lang}", cache_dir=args.hf_cache_dir - )["train"].shuffle(seed=42).select(range(args.sample_size)) + )["train"].shuffle(seed=args.seed).select(range(args.sample_size)) else: raw_datasets = load_dataset( From 95240654f39ae3f62ab63173cd443e7776bd9a21 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 28 Jun 2022 01:38:07 -0400 Subject: [PATCH 111/142] fix unintended bugs arising from cached tokenized data --- scripts/lang_adapt/madx_run_clm.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 2f5cf5c..c71a2e9 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -320,9 +320,9 @@ def preprocess_data(training_args, data_args, model_args, tokenizer): with 
training_args.main_process_first(desc="dataset map tokenization"): # cache tokenized data base_cache_dir = f"{model_args.cache_dir}/{data_args.dataset_name}/{data_args.dataset_config_name}" - saved_tokenized_datasets_fp = pathlib.Path(f"{base_cache_dir}/tokenized_data_{data_args.max_train_samples}train_{data_args.max_eval_samples}eval.pt") + saved_tokenized_datasets_fp = pathlib.Path(f"{base_cache_dir}/tokenized_data_{data_args.max_train_samples}train_{data_args.max_eval_samples}eval_{len(tokenizer)}vocab.pt") - if saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): + if not data_args.overwrite_cache and saved_tokenized_datasets_fp.exists() and saved_tokenized_datasets_fp.is_file(): tokenized_datasets = torch.load(str(saved_tokenized_datasets_fp)) logger.info(f"✅ loaded tokenized_data from {saved_tokenized_datasets_fp}") else: @@ -414,9 +414,9 @@ def group_texts(examples): with training_args.main_process_first(desc="grouping texts together"): base_cache_dir = f"{model_args.cache_dir}/{data_args.dataset_name}/{data_args.dataset_config_name}" - saved_lm_datasets_fp = pathlib.Path(f"{base_cache_dir}/lm_data_{data_args.max_train_samples}train_{data_args.max_eval_samples}eval.pt") + saved_lm_datasets_fp = pathlib.Path(f"{base_cache_dir}/lm_data_{data_args.max_train_samples}train_{data_args.max_eval_samples}eval_{len(tokenizer)}vocab.pt") - if saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): + if not data_args.overwrite_cache and saved_lm_datasets_fp.exists() and saved_lm_datasets_fp.is_file(): lm_datasets = torch.load(str(saved_lm_datasets_fp)) logger.info(f"✅ loaded lm_data from {saved_lm_datasets_fp}") else: @@ -534,7 +534,6 @@ def get_adapter_config(adapter_args, model_args): model.resize_token_embeddings(len(tokenizer)) overlap = set(tokenizer.vocab).intersection(set(orig_tokenizer.vocab)) - print(len(tokenizer)) print(f"{len(overlap)} tokens overlapped") curr_vocab = tokenizer.vocab orig_vocab = orig_tokenizer.vocab @@ -549,7 +548,6 @@ def get_adapter_config(adapter_args, model_args): elif model_args.embedding_strategies == "replace": model.resize_token_embeddings(len(tokenizer)) - print(len(tokenizer)) model.tie_weights() elif model_args.embedding_strategies == "extend": @@ -597,6 +595,7 @@ def zero_grad(grad): print(f"Total frozen parameters: {frozen_params}") print(f"Total emb parameters (wte, wpe): {emb_params}") print(f"Total trainable parameters: {trainable_params}") + def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. 
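The caching change in PATCH 111 above works because the cache file name now encodes `len(tokenizer)` alongside the train/eval sample counts, so runs that switch between `replace`, `overlap-replace`, and `extend` tokenizers can no longer silently reuse each other's cached tensors, and `--overwrite_cache` is now honored. A minimal sketch of the same pattern, separate from `madx_run_clm.py` (the function name and cache layout below are illustrative only, not the script's actual ones):

```python
import pathlib
import torch

def load_or_build_tokenized(raw_dataset, tokenizer, cache_root, max_train, max_eval, overwrite_cache=False):
    # Key the cache on everything that changes the tokenized output,
    # including the tokenizer's vocabulary size.
    cache_fp = pathlib.Path(
        f"{cache_root}/tokenized_data_{max_train}train_{max_eval}eval_{len(tokenizer)}vocab.pt"
    )
    if not overwrite_cache and cache_fp.is_file():
        return torch.load(str(cache_fp))

    tokenized = raw_dataset.map(
        lambda batch: tokenizer(batch["text"]),
        batched=True,
        remove_columns=raw_dataset.column_names,
    )
    cache_fp.parent.mkdir(parents=True, exist_ok=True)
    torch.save(tokenized, str(cache_fp))
    return tokenized
```

The patch applies the same vocabulary-aware key to the grouped `lm_data_*` cache as well.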
From c2722a72b740c45ecb02b08c15a783061d0e82c3 Mon Sep 17 00:00:00 2001
From: yongzx
Date: Tue, 28 Jun 2022 03:16:08 -0400
Subject: [PATCH 112/142] add last-layer finetuning for tasks

---
 scripts/eval/eval.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 0c3128f..652885e 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -65,6 +65,10 @@
 parser.add_argument("--baseline", default=False, action="store_true")
 parser.add_argument("--deepspeed", required=False)
 
+task_layers = ["task-adapters", "last-layer"]
+parser.add_argument("--task_layers", choices=task_layers, required=True)
+
+
 # mapping of tasks to model/trainer classes
 model_class_mapping = {
     XNLI: AutoModelForSequenceClassification,
@@ -327,7 +331,6 @@ def compute_beam_search_metrics(model, dataset):
     # labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels]
 
     # result = metric.compute(predictions=preds, references=labels)
-    assert False
 else:
     raise ValueError("Unknown dataset provided")
 
@@ -364,6 +367,15 @@ def print_model_trainable_layers(model):
             print(f"🚀 Trainable layer '{name}'")
 
 def load_model(args, inference=False):
+    def make_last_layer_trainable(args, model, inference=False):
+        for name, param in model.named_parameters():
+            # task-specific last layer
+            if 'transformer' not in name:
+                param.requires_grad = True
+            else:
+                param.requires_grad = False
+        return model
+
     def load_task_specific_adapters(args, model, inference=False):
         if not inference:
             model.add_adapter(f"{args.dataset.split('/')[-1]}-task-adapter")
@@ -412,7 +424,10 @@ def load_language_adapters(args, model):
     # baseline: only need to add task-specific adapters
     # (keeps separated for now for easier debugging)
     if args.baseline:
-        model = load_task_specific_adapters(args, model, inference)
+        if args.task_layers == "task-adapters":
+            model = load_task_specific_adapters(args, model, inference)
+        elif args.task_layers == "last-layer":
+            model = make_last_layer_trainable(args, model, inference)
         return model
 
     # adapted models
@@ -421,7 +436,12 @@ def load_language_adapters(args, model):
     if args.madx_lang_adapter:
         model = load_language_adapters(args, model)
 
-    model = load_task_specific_adapters(args, model, inference)
+    if args.task_layers == "task-adapters":
+        model = load_task_specific_adapters(args, model, inference)
+    elif args.task_layers == "last-layer":
+        model = make_last_layer_trainable(args, model, inference)
+    print_model_trainable_layers(model)
+    assert False
     return model

From c9b777313fa726d81df51ef101b75778a6004fb4 Mon Sep 17 00:00:00 2001
From: yongzx
Date: Tue, 28 Jun 2022 03:29:19 -0400
Subject: [PATCH 113/142] remove assert False

---
 scripts/eval/eval.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 652885e..9cb7123 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -440,8 +440,6 @@ def load_language_adapters(args, model):
         model = load_task_specific_adapters(args, model, inference)
     elif args.task_layers == "last-layer":
         model = make_last_layer_trainable(args, model, inference)
-    print_model_trainable_layers(model)
-    assert False
     return model

From e1079c130f465b9ec59623012207a45a79a8f581 Mon Sep 17 00:00:00 2001
From: yongzx
Date: Tue, 28 Jun 2022 03:54:39 -0400
Subject: [PATCH 114/142] support BERT training

---
 scripts/eval/eval.py | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/scripts/eval/eval.py
b/scripts/eval/eval.py index 9cb7123..afe7b88 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -157,13 +157,21 @@ # load tokenizer logger.info("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision, add_prefix_space=args.dataset in [WIKIANN]) -tokenizer.pad_token = tokenizer.eos_token +if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + # TODO: we probably need better code for this than multiple if-else statements en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision, add_prefix_space=args.dataset in [WIKIANN]) -en_tokenizer.pad_token = en_tokenizer.eos_token +if en_tokenizer.pad_token is None: + en_tokenizer.pad_token = en_tokenizer.eos_token if args.dataset == XNLI: + if tokenizer.eos_token is None: + tokenizer.eos_token = tokenizer.sep_token + if en_tokenizer.eos_token is None: + en_tokenizer.eos_token = en_tokenizer.sep_token + def tokenize_function(examples): return tokenizer(f'{examples["premise"]} {tokenizer.eos_token} {examples["hypothesis"]}', max_length=128, padding="max_length", truncation=True) @@ -368,12 +376,7 @@ def print_model_trainable_layers(model): def load_model(args, inference=False): def make_last_layer_trainable(args, model, inference=False): - for name, param in model.named_parameters(): - # task-specific last layer - if 'transformer' not in name: - param.requires_grad = True - else: - param.requires_grad = False + model.freeze_model(freeze=True) return model def load_task_specific_adapters(args, model, inference=False): @@ -457,8 +460,14 @@ def load_language_adapters(args, model): label_pad_token_id=-100, ) + if model.active_adapters is None: + logger.info("No active adapters") + trainer_class = trainer_no_task_adpt_class_mapping[args.dataset] + else: + trainer_class = trainer_class_mapping[args.dataset] logger.info(f"Using {trainer_class_mapping[args.dataset]} for training") - trainer = trainer_class_mapping[args.dataset]( + + trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset, @@ -500,7 +509,13 @@ def load_language_adapters(args, model): pad_to_multiple_of=8 if training_args.fp16 else None, ) - eval_trainer = trainer_class_mapping[args.dataset]( + if model.active_adapters is None: + logger.info("No active adapters") + trainer_class = trainer_no_task_adpt_class_mapping[args.dataset] + else: + trainer_class = trainer_class_mapping[args.dataset] + + eval_trainer = trainer_class( model=model, args=training_args, eval_dataset=test_dataset, From 7c2b034786f502423fab62c12b3bc6f0a14e52b4 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 28 Jun 2022 18:40:14 +0700 Subject: [PATCH 115/142] moved bitfit scripts --- .../{ => bitfit}/run_clm_bitfit_my.sh | 15 ++++++++------- .../lang_adapt/bitfit/train_tokenizer_update.sh | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 7 deletions(-) rename scripts/lang_adapt/{ => bitfit}/run_clm_bitfit_my.sh (78%) create mode 100644 scripts/lang_adapt/bitfit/train_tokenizer_update.sh diff --git a/scripts/lang_adapt/run_clm_bitfit_my.sh b/scripts/lang_adapt/bitfit/run_clm_bitfit_my.sh similarity index 78% rename from scripts/lang_adapt/run_clm_bitfit_my.sh rename to scripts/lang_adapt/bitfit/run_clm_bitfit_my.sh index e93c490..cfcc3a3 100644 --- a/scripts/lang_adapt/run_clm_bitfit_my.sh +++ b/scripts/lang_adapt/bitfit/run_clm_bitfit_my.sh @@ -5,12 +5,12 @@ LANG="my" DATA_SAMPLES=100000 #$(($SLURM_ARRAY_TASK_ID * 
1000)) VOCAB_SIZE=5000 CH=118500 -BIGS_MODEL="bigscience/bloom-1b3" +BIGS_MODEL="bigscience/bloom-350m" ADPT_STRATEGY="emb-and-adpt" -EMBD_SRATEGY="overlap-replace" +EMBD_SRATEGY="extend" FTNE_STRATEGY="bitfit" -tokenizer_dir="bigscience/bloom-1b3" #"/users/zyong2/data/zyong2/bigscience/data/processed/020/tok_${BIGS_MODEL##*/}_${LANG}_oscar_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY}" +tokenizer_dir="checkpoint/tok_bloom-350m_my_oscar_100000samples_24000vocab_extend" cache_dir="checkpoint/cache/" output_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${EMBD_SRATEGY}_${FTNE_STRATEGY}_${DATA_SAMPLES}samples" logging_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${EMBD_SRATEGY}_${FTNE_STRATEGY}_${DATA_SAMPLES}samples" @@ -28,16 +28,17 @@ python scripts/lang_adapt/madx_run_clm.py \ --dataset_config_name "unshuffled_deduplicated_${LANG}" \ --logging_dir $logging_dir \ --report_to "tensorboard" \ - --learning_rate 0.001 \ + --learning_rate 1e-4 \ + --lr_scheduler_type "constant" \ --do_train \ --do_eval \ --output_dir $output_dir \ --preprocessing_num_workers 8 \ --overwrite_output_dir \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 16 \ --per_device_eval_batch_size 2 \ - --eval_accumulation_steps 4 \ + --eval_accumulation_steps 1 \ --eval_steps 1000 \ --evaluation_strategy "epoch" \ --max_eval_samples 5000 \ diff --git a/scripts/lang_adapt/bitfit/train_tokenizer_update.sh b/scripts/lang_adapt/bitfit/train_tokenizer_update.sh new file mode 100644 index 0000000..47d8fa0 --- /dev/null +++ b/scripts/lang_adapt/bitfit/train_tokenizer_update.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=cpu + +# Use more memory (10GB) (CPU RAM): +#SBATCH --mem=50g + + + +lng=$1 +model=$2 +tokenizer_dir=$3 +vocab_size=$4 +sample_size=$5 +python tokenized4clm_sampled.py --lang $lng --model $model --tokenizer_dir $tokenizer_dir --vocab_size $vocab_size --sample_size $sample_size --extend_vocab + From 625a43b02fb70d0ea56fa043730034a1d681cee4 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 29 Jun 2022 09:19:21 -0400 Subject: [PATCH 116/142] support LoRA --- scripts/lang_adapt/madx_run_clm.py | 61 ++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index c71a2e9..145aa07 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -34,7 +34,7 @@ set_seed, ) from transformers.adapters.configuration import AdapterConfig -from transformers.adapters import PrefixTuningConfig +from transformers.adapters import PrefixTuningConfig, LoRAConfig from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint @@ -193,6 +193,40 @@ def __post_init__(self): extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." +@dataclass +class ParamEfficientArguments(MultiLingAdapterArguments): + """ + Arguments pertaining to other parameter efficient techniques such as (LoRA, BitFit, etc.) + """ + # lora + selfattn_lora: bool = field( + default=True, + metadata={"help": "If True, add LoRA to the self-attention weights of a model. Defaults to True."}, + ) + intermediate_lora: bool = field( + default=False, + metadata={"help": "If True, add LoRA to the intermediate MLP weights of a model. 
Defaults to False."}, + ) + output_lora: bool = field( + default=False, + metadata={"help": "If True, add LoRA to the output MLP weights of a model. Defaults to False."}, + ) + r_lora: Optional[int] = field( + default=8, + metadata={"help": "If True, add LoRA to the output MLP weights of a model. Defaults to False."}, + ) + alpha_lora: Optional[int] = field( + default=8, + metadata={"help": "If True, add LoRA to the output MLP weights of a model. Defaults to False."}, + ) + dropout_lora: Optional[float] = field( + default=0.0, + metadata={"help": "If True, add LoRA to the output MLP weights of a model. Defaults to False."}, + ) + init_weights_lora: Optional[str] = field( + default='lora', + metadata={"help": "If True, add LoRA to the output MLP weights of a model. Defaults to False."}, + ) def load_tokenizer(model_args): tokenizer_kwargs = { @@ -433,13 +467,8 @@ def group_texts(examples): return lm_datasets def modify_model(adapter_args, data_args, model_args, tokenizer, model): - #if "emb" in model_args.lang_adapt_strategies: - # if "replace" in model_args.embedding_strategies: - # for name, param in model.named_parameters(): - # if "wte" not in name and "wpe" not in name and "lm_head" not in name: - # param.requires_grad = False - def get_adapter_config(adapter_args, model_args): + # modify here for new parameter efficient techniques associated with adapter-hub if adapter_args.adapter_config == "prefix_tuning": if model_args.adapter_placement == "all": adapter_config = PrefixTuningConfig(bottleneck_size = 800) @@ -449,9 +478,17 @@ def get_adapter_config(adapter_args, model_args): leave_out = [i for i in range(0,24) if not i in adapters2use] ) - + elif adapter_args.adapter_config == "lora": + adapter_config = LoRAConfig( + selfattn_lora = adapter_args.selfattn_lora, + intermediate_lora = adapter_args.intermediate_lora, + output_lora = adapter_args.output_lora, + r = adapter_args.r_lora, + alpha = adapter_args.alpha_lora, + dropout = adapter_args.dropout_lora, + init_weights = adapter_args.init_weights_lora, + ) else: - if model_args.adapter_placement == "all": adapter_config = AdapterConfig.load( adapter_args.adapter_config, @@ -500,8 +537,10 @@ def get_adapter_config(adapter_args, model_args): ) else: lang_adapter_name = None + # Freeze all model weights except of those of this adapter model.train_adapter(task_name, train_embeddings=True) + # Set the adapters to be used in every forward pass #if lang_adapter_name: # model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) @@ -601,7 +640,7 @@ def main(): # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. 
- parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, ParamEfficientArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, @@ -614,7 +653,7 @@ def main(): training_args.data_dir = f'{training_args.output_dir}' - assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt') + assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt', 'lora') assert model_args.embedding_strategies in ('replace', 'extend', 'overlap-replace') # Setup logging From 6d7a59b7333df1d0a9c1614433ad6589e44e2103 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Wed, 29 Jun 2022 20:36:57 +0700 Subject: [PATCH 117/142] config changes --- scripts/lang_adapt/bitfit/run_clm_bitfit_my.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/lang_adapt/bitfit/run_clm_bitfit_my.sh b/scripts/lang_adapt/bitfit/run_clm_bitfit_my.sh index cfcc3a3..dce196c 100644 --- a/scripts/lang_adapt/bitfit/run_clm_bitfit_my.sh +++ b/scripts/lang_adapt/bitfit/run_clm_bitfit_my.sh @@ -10,15 +10,15 @@ ADPT_STRATEGY="emb-and-adpt" EMBD_SRATEGY="extend" FTNE_STRATEGY="bitfit" -tokenizer_dir="checkpoint/tok_bloom-350m_my_oscar_100000samples_24000vocab_extend" +tokenizer_dir="checkpoint/tokenizer_ext_my/" cache_dir="checkpoint/cache/" output_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${EMBD_SRATEGY}_${FTNE_STRATEGY}_${DATA_SAMPLES}samples" -logging_dir="checkpoint/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${EMBD_SRATEGY}_${FTNE_STRATEGY}_${DATA_SAMPLES}samples" +logging_dir="logs/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${EMBD_SRATEGY}_${FTNE_STRATEGY}_${DATA_SAMPLES}samples" mkdir -p $output_dir mkdir -p $logging_dir -python scripts/lang_adapt/madx_run_clm.py \ +python madx_run_clm.py \ --seed 0 \ --fp16 \ --model_name_or_path $BIGS_MODEL \ @@ -27,6 +27,8 @@ python scripts/lang_adapt/madx_run_clm.py \ --cache_dir $cache_dir \ --dataset_config_name "unshuffled_deduplicated_${LANG}" \ --logging_dir $logging_dir \ + --logging_first_step True \ + --logging_steps 8 \ --report_to "tensorboard" \ --learning_rate 1e-4 \ --lr_scheduler_type "constant" \ @@ -35,8 +37,8 @@ python scripts/lang_adapt/madx_run_clm.py \ --output_dir $output_dir \ --preprocessing_num_workers 8 \ --overwrite_output_dir \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 16 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ --per_device_eval_batch_size 2 \ --eval_accumulation_steps 1 \ --eval_steps 1000 \ @@ -46,7 +48,7 @@ python scripts/lang_adapt/madx_run_clm.py \ --save_strategy "epoch" \ --save_total_limit 3 \ --max_train_samples ${DATA_SAMPLES}\ - --max_steps 50000 \ + --max_steps 6250 \ --load_best_model_at_end \ --lang_adapt_strategies $ADPT_STRATEGY \ --embedding_strategies $EMBD_SRATEGY \ From 1fb650454f15e6eb943ba7a0b669555768260787 Mon Sep 17 00:00:00 2001 From: yongzx Date: Wed, 29 Jun 2022 10:50:09 -0400 Subject: [PATCH 118/142] refactor repo --- scripts/{ => archive}/eval/adapters_xnli_de.py | 0 scripts/{ => archive}/eval/adapters_xnli_de_vn.py | 0 scripts/{ => archive}/eval_xnli/adapters_eval.py | 0 scripts/{ => archive}/eval_xnli/adapters_xlsum_de.py | 0 scripts/{ => archive}/eval_xnli/crosslingual_exp.sh | 0 scripts/eval/{ => xnli}/run_eval_xnli_zero_shot.sh | 0 scripts/eval/{ => 
xnli}/train_xnli_zero_shot.sh | 0 scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh | 2 +- scripts/lang_adapt/{ => scripts}/run_clm_adpt.sh | 0 scripts/lang_adapt/{ => scripts}/run_clm_adpt_vn.sh | 0 scripts/lang_adapt/{ => scripts}/run_clm_emb.sh | 0 scripts/lang_adapt/{ => scripts}/train_tokenizer_scratch.sh | 0 scripts/lang_adapt/{ => scripts}/train_tokenizer_update.sh | 0 13 files changed, 1 insertion(+), 1 deletion(-) rename scripts/{ => archive}/eval/adapters_xnli_de.py (100%) rename scripts/{ => archive}/eval/adapters_xnli_de_vn.py (100%) rename scripts/{ => archive}/eval_xnli/adapters_eval.py (100%) rename scripts/{ => archive}/eval_xnli/adapters_xlsum_de.py (100%) rename scripts/{ => archive}/eval_xnli/crosslingual_exp.sh (100%) rename scripts/eval/{ => xnli}/run_eval_xnli_zero_shot.sh (100%) rename scripts/eval/{ => xnli}/train_xnli_zero_shot.sh (100%) rename scripts/lang_adapt/{ => scripts}/run_clm_adpt.sh (100%) rename scripts/lang_adapt/{ => scripts}/run_clm_adpt_vn.sh (100%) rename scripts/lang_adapt/{ => scripts}/run_clm_emb.sh (100%) rename scripts/lang_adapt/{ => scripts}/train_tokenizer_scratch.sh (100%) rename scripts/lang_adapt/{ => scripts}/train_tokenizer_update.sh (100%) diff --git a/scripts/eval/adapters_xnli_de.py b/scripts/archive/eval/adapters_xnli_de.py similarity index 100% rename from scripts/eval/adapters_xnli_de.py rename to scripts/archive/eval/adapters_xnli_de.py diff --git a/scripts/eval/adapters_xnli_de_vn.py b/scripts/archive/eval/adapters_xnli_de_vn.py similarity index 100% rename from scripts/eval/adapters_xnli_de_vn.py rename to scripts/archive/eval/adapters_xnli_de_vn.py diff --git a/scripts/eval_xnli/adapters_eval.py b/scripts/archive/eval_xnli/adapters_eval.py similarity index 100% rename from scripts/eval_xnli/adapters_eval.py rename to scripts/archive/eval_xnli/adapters_eval.py diff --git a/scripts/eval_xnli/adapters_xlsum_de.py b/scripts/archive/eval_xnli/adapters_xlsum_de.py similarity index 100% rename from scripts/eval_xnli/adapters_xlsum_de.py rename to scripts/archive/eval_xnli/adapters_xlsum_de.py diff --git a/scripts/eval_xnli/crosslingual_exp.sh b/scripts/archive/eval_xnli/crosslingual_exp.sh similarity index 100% rename from scripts/eval_xnli/crosslingual_exp.sh rename to scripts/archive/eval_xnli/crosslingual_exp.sh diff --git a/scripts/eval/run_eval_xnli_zero_shot.sh b/scripts/eval/xnli/run_eval_xnli_zero_shot.sh similarity index 100% rename from scripts/eval/run_eval_xnli_zero_shot.sh rename to scripts/eval/xnli/run_eval_xnli_zero_shot.sh diff --git a/scripts/eval/train_xnli_zero_shot.sh b/scripts/eval/xnli/train_xnli_zero_shot.sh similarity index 100% rename from scripts/eval/train_xnli_zero_shot.sh rename to scripts/eval/xnli/train_xnli_zero_shot.sh diff --git a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh index 084e19d..b100574 100644 --- a/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh +++ b/scripts/exp_sentence_retrievale_eval/compute_retrieval_acc.sh @@ -13,4 +13,4 @@ model=$1 dataset=$2 outdir=$model/retrieval_acc-${dataset} mkdir -p $outdir -python eval_sentence_retrieval.py $outdir --pretrained_model $model --tokenizer $model --dataset $dataset --pooling "max_min" +python eval_sentence_retrieval.py $outdir --pretrained_model $model --tokenizer $model --dataset $dataset --pooling "max_min" \ No newline at end of file diff --git a/scripts/lang_adapt/run_clm_adpt.sh b/scripts/lang_adapt/scripts/run_clm_adpt.sh similarity 
index 100% rename from scripts/lang_adapt/run_clm_adpt.sh rename to scripts/lang_adapt/scripts/run_clm_adpt.sh diff --git a/scripts/lang_adapt/run_clm_adpt_vn.sh b/scripts/lang_adapt/scripts/run_clm_adpt_vn.sh similarity index 100% rename from scripts/lang_adapt/run_clm_adpt_vn.sh rename to scripts/lang_adapt/scripts/run_clm_adpt_vn.sh diff --git a/scripts/lang_adapt/run_clm_emb.sh b/scripts/lang_adapt/scripts/run_clm_emb.sh similarity index 100% rename from scripts/lang_adapt/run_clm_emb.sh rename to scripts/lang_adapt/scripts/run_clm_emb.sh diff --git a/scripts/lang_adapt/train_tokenizer_scratch.sh b/scripts/lang_adapt/scripts/train_tokenizer_scratch.sh similarity index 100% rename from scripts/lang_adapt/train_tokenizer_scratch.sh rename to scripts/lang_adapt/scripts/train_tokenizer_scratch.sh diff --git a/scripts/lang_adapt/train_tokenizer_update.sh b/scripts/lang_adapt/scripts/train_tokenizer_update.sh similarity index 100% rename from scripts/lang_adapt/train_tokenizer_update.sh rename to scripts/lang_adapt/scripts/train_tokenizer_update.sh From 8c811bba3bc8cd3fbc18aeed9ff3a3b09b7963aa Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 30 Jun 2022 10:37:58 -0400 Subject: [PATCH 119/142] WIP --- scripts/eval/eval.py | 57 ++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index afe7b88..c85f637 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -65,7 +65,7 @@ parser.add_argument("--baseline", default=False, action="store_true") parser.add_argument("--deepspeed", required=False) -task_layers = ["task-adapters", "last-layer"] +task_layers = ["task-adapters", "last-layer", "full-model"] parser.add_argument("--task_layers", choices=task_layers, required=True) @@ -160,7 +160,6 @@ if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - # TODO: we probably need better code for this than multiple if-else statements en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision, add_prefix_space=args.dataset in [WIKIANN]) if en_tokenizer.pad_token is None: @@ -376,10 +375,42 @@ def print_model_trainable_layers(model): def load_model(args, inference=False): def make_last_layer_trainable(args, model, inference=False): + if model is None: + if not inference: + model_path = args.original_model + else: + model_path = args.pretrained_adapters_dir + print(f"Loaded model from {model_path}") + model = model_class_mapping[args.dataset].from_pretrained(model_path, + pad_token_id=pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision, + **optional_model_kwargs) model.freeze_model(freeze=True) return model + + def make_base_model_trainable(args, model, inference=False): + if model is None: + if not inference: + model_path = args.original_model + else: + model_path = args.pretrained_adapters_dir + print(f"Loaded model from {model_path}") + model = model_class_mapping[args.dataset].from_pretrained(model_path, + pad_token_id=pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision, + **optional_model_kwargs) + model.freeze_model(freeze=False) + return model def load_task_specific_adapters(args, model, inference=False): + if model is None: + model = model_class_mapping[args.dataset].from_pretrained(args.original_model, + pad_token_id=pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision, + **optional_model_kwargs) if not inference: model.add_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") 
model.train_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") @@ -390,14 +421,6 @@ def load_task_specific_adapters(args, model, inference=False): return model def load_embedding_layers(args, tokenizer, model): - ###### legacy code - # # use original causal LM model to load the embedding layers - # causal_lm_model = AutoModelForCausalLM.from_pretrained(args.original_model) - # causal_lm_model.resize_token_embeddings(len(tokenizer)) - # if not args.original_model == args.adapted_model_dir: - # causal_lm_model.transformer.wte = wte - # causal_lm_model.transformer.wpe = wpe - if "tr5b-1B3" in args.original_model: # previous 1.3B bigsience model token_embedding = torch.load(f'{args.adapted_model_dir}/embedding_wte.pt') add_embedding = torch.load(f'{args.adapted_model_dir}/embedding_wpe.pt') @@ -418,21 +441,25 @@ def load_language_adapters(args, model): return model pad_token_id = en_tokenizer.pad_token_id if (not inference and args.cross_lingual) else tokenizer.pad_token_id - model = model_class_mapping[args.dataset].from_pretrained(args.original_model, - pad_token_id=pad_token_id, - cache_dir=args.cache_dir, - revision=args.revision, - **optional_model_kwargs) # baseline: only need to add task-specific adapters # (keeps separated for now for easier debugging) if args.baseline: + model = None if args.task_layers == "task-adapters": model = load_task_specific_adapters(args, model, inference) elif args.task_layers == "last-layer": model = make_last_layer_trainable(args, model, inference) + elif args.task_layers == "full-model": + model = make_base_model_trainable(args, model, inference) return model + model = model_class_mapping[args.dataset].from_pretrained(args.original_model, + pad_token_id=pad_token_id, + cache_dir=args.cache_dir, + revision=args.revision, + **optional_model_kwargs) + # adapted models if not args.cross_lingual or inference: model = load_embedding_layers(args, tokenizer, model) From a8486d4c661434d154afc613bf4a4ce410877d32 Mon Sep 17 00:00:00 2001 From: yongzx Date: Fri, 1 Jul 2022 09:48:55 -0400 Subject: [PATCH 120/142] update eval/scripts_* directory --- scripts/eval/{wikiann => scripts_wikiann}/adpt_wikiann_de.sh | 0 scripts/eval/{wikiann => scripts_wikiann}/baseline_wikiann_de.sh | 0 scripts/eval/{xnli => scripts_xnli}/run_eval_xnli_zero_shot.sh | 0 scripts/eval/{xnli => scripts_xnli}/train_xnli_zero_shot.sh | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename scripts/eval/{wikiann => scripts_wikiann}/adpt_wikiann_de.sh (100%) rename scripts/eval/{wikiann => scripts_wikiann}/baseline_wikiann_de.sh (100%) rename scripts/eval/{xnli => scripts_xnli}/run_eval_xnli_zero_shot.sh (100%) rename scripts/eval/{xnli => scripts_xnli}/train_xnli_zero_shot.sh (100%) diff --git a/scripts/eval/wikiann/adpt_wikiann_de.sh b/scripts/eval/scripts_wikiann/adpt_wikiann_de.sh similarity index 100% rename from scripts/eval/wikiann/adpt_wikiann_de.sh rename to scripts/eval/scripts_wikiann/adpt_wikiann_de.sh diff --git a/scripts/eval/wikiann/baseline_wikiann_de.sh b/scripts/eval/scripts_wikiann/baseline_wikiann_de.sh similarity index 100% rename from scripts/eval/wikiann/baseline_wikiann_de.sh rename to scripts/eval/scripts_wikiann/baseline_wikiann_de.sh diff --git a/scripts/eval/xnli/run_eval_xnli_zero_shot.sh b/scripts/eval/scripts_xnli/run_eval_xnli_zero_shot.sh similarity index 100% rename from scripts/eval/xnli/run_eval_xnli_zero_shot.sh rename to scripts/eval/scripts_xnli/run_eval_xnli_zero_shot.sh diff --git a/scripts/eval/xnli/train_xnli_zero_shot.sh 
b/scripts/eval/scripts_xnli/train_xnli_zero_shot.sh similarity index 100% rename from scripts/eval/xnli/train_xnli_zero_shot.sh rename to scripts/eval/scripts_xnli/train_xnli_zero_shot.sh From 127adf1d8c0a8855a1cf439e8a5ffcd2de70e040 Mon Sep 17 00:00:00 2001 From: Yong Zheng-Xin Date: Fri, 1 Jul 2022 23:21:55 +0800 Subject: [PATCH 121/142] Update README.md --- scripts/lang_adapt/README.md | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index 35efb6a..3498b3c 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -1,24 +1,37 @@ # README ### Tokenizer and Tokenization of Dataset -Run `tokenized4clm.py` to train the tokenizer on OSCAR dataset. -- `lang`: language name (e.g., "de", "th") -- `model`: model that uses this tokenizer (e.g., "gpt2", "bigscience/bloom-1b3`) -- `tokenizer_dir`: path directory to save the tokenizer. The tokenizer will be saved as `{lang}_oscar_tokenizer_{vocab_size}` -- `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer. -- `vocab_size`: vocab size of the tokenizer - - Run `tokenized4clm_sampled.py` to train the tokenizer on the subset of OSCAR dataset. - `lang`: language name (e.g., "de", "th") -- `tokenizer_dir`: path directory to save the tokenizer. The tokenizer will be saved as `{lang}_oscar_tokenizer_{vocab_size}` -- `hf_cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer. +- `model`: original tokenizer (e.g., "bigscience/bloom-1b3") +- `tokenizer_dir`: path directory to save the tokenizer. The tokenizer will be saved as `tok_${model}_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_{replace/extend}` +- `cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer. - `vocab_size`: vocab size of the tokenizer - `sample_size`: the amount of samples to use to train the tokenizer (randomly selected) +- `use_auth_token`: must be used for BLOOM model +- `extend`: if set, it means that we are extending instead of replacing. + +``` +tokenizer_dir=... # directory to save trained tokenizer +cache_dir=... # directory to cache downloaded HF model +lang=... # language +sample_size=... # training sample size +vocab_size=... # vocab size of tokenizer +model="bigscience/bloom-1b3" +python ./scripts/lang_adapt/tokenized4clm_sampled.py \ +--lang $lang \ +--model $model \ +--tokenizer_dir \ +--hf_cache_dir $cache_dir \ +--vocab_size $vocab_size \ +--sample_size $sample_size \ +--use_auth_token +--extend # use "extend" for the embedding strategy of extending vocab. +``` --- ### Language Adaptation (6 Combinations) - use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`. - use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). 
Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`. - - Hack: after `trainer.save_model()`, manually save the `wte` and `wpe` weights. \ No newline at end of file + - Hack: after `trainer.save_model()`, manually save the `wte` and `wpe` weights. From f3223e443e9f89e22b1e1597a194008df524177d Mon Sep 17 00:00:00 2001 From: Yong Zheng-Xin Date: Fri, 1 Jul 2022 23:30:33 +0800 Subject: [PATCH 122/142] Update README.md --- scripts/lang_adapt/README.md | 80 +++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 5 deletions(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index 3498b3c..8b1b38c 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -7,7 +7,7 @@ Run `tokenized4clm_sampled.py` to train the tokenizer on the subset of OSCAR dat - `tokenizer_dir`: path directory to save the tokenizer. The tokenizer will be saved as `tok_${model}_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_{replace/extend}` - `cache_dir` (default is "~/.cache/huggingface/transformers"): cache directory for downloading the OSCAR dataset and GPT2 tokenizer. - `vocab_size`: vocab size of the tokenizer -- `sample_size`: the amount of samples to use to train the tokenizer (randomly selected) +- `sample_size`: the amount of samples to use to train the tokenizer (randomly selected) - `use_auth_token`: must be used for BLOOM model - `extend`: if set, it means that we are extending instead of replacing. @@ -31,7 +31,77 @@ python ./scripts/lang_adapt/tokenized4clm_sampled.py \ ``` --- -### Language Adaptation (6 Combinations) -- use `sbatch run_clm_emb.sh` to perform language adaptation with (emb-only, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_emb.sh`. -- use `sbatch run_clm_adpt.sh` to perform language adaptation with (emb-and-adpt, replace-vocab) strategies. Replace the LANG variable for the desired language (currently is `th`). Currently, the script uses slurm-job-array to control the size of the oscar training corpora and `ADPT_REDUCTION_FACTOR` to control the reduction factor of adapter modules. Note: remember to change the SLURM logging output files, `tokenizer_dir`, `cache_dir`, `output_dir`, and `logging_dir` in `run_clm_adpt.sh`. - - Hack: after `trainer.save_model()`, manually save the `wte` and `wpe` weights. +### Language Adaptation +Run `madx_run_clm.py` to finetune language model on a new language. +- `LANG`: language name (e.g., "de", "th") on OSCAR +- `DATA_SAMPLES`: training sample size +- `VOCAB_SIZE`: vocab size of the tokenizer +- `BIGS_MODEL`: bigscience model +- `ADPT_STRATEGY`: language adaptation strategy (train only embedding for now: `"emb"`) +- `EMBD_SRATEGY`: embedding strategy. 
Either `"replace"` (replace the embedding layer entirely), `"overlap-replace"` (replace but initialize seen vocab with pretrained embedding), or `"extend"` (freeze seen vocab embeddings and add trainable embeddings for unseen vocab) +- `TOK_STRATEGY`: tokenization strategy (either `"replace"` (for embedding strategy of "replace" and "overlap-replace") or `"extend"`) +- `tokenizer_dir`: saved tokenizer directory (used in the tokenization script above) +- `cache_dir`: (as above) +- `output_dir`: directory to save adapted model +- `logging_dir`: directory to log loss curves to tensorboard +- `MAX_STEPS`: training steps +- `EVAL_STEPS`: number of training steps between two evaluations +- `SAVE_STEPS`: number of training steps between saving the checkpoints. +``` +LANG=... # language +DATA_SAMPLES=... # training sample size +VOCAB_SIZE=... # vocab size of newly trained tokenizer +BIGS_MODEL="bigscience/bloom-1b3" +ADPT_STRATEGY="emb" # language adaptation strategy (train only embedding for now) +EMBD_SRATEGY=... # either "replace", "overlap-replace", or "extend" +TOK_STRATEGY=... # either "replace" (for embedding strategy of "replace" and "overlap-replace") or "extend" + +tokenizer_dir=... # as above +tokenizer_dir="${tokenizer_dir}/tok_${BIGS_MODEL##*/}_${LANG}_oscar_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${TOK_STRATEGY}" +cache_dir=... # as above + +output_dir=... # directory to save adapted model +output_dir="${output_dir}/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY}" +logging_dir=... # directory to log loss curves to tensorboard +logging_dir="${logging_dir}/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY} + +mkdir -p $output_dir +mkdir -p $logging_dir + +MAX_STEPS=50000 +EVAL_STEPS=5000 +SAVE_STEPS=5000 + +python ./scripts/lang_adapt/madx_run_clm.py \ + --seed 0 \ + --fp16 \ + --model_name_or_path $BIGS_MODEL \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --cache_dir $cache_dir \ + --dataset_config_name "unshuffled_deduplicated_${LANG}" \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 4 \ + --per_device_eval_batch_size 2 \ + --eval_accumulation_steps 4 \ + --eval_steps $EVAL_STEPS \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --save_steps $SAVE_STEPS \ + --save_strategy "steps" \ + --max_train_samples $DATA_SAMPLES \ + --max_steps $MAX_STEPS \ + --logging_steps 1000 \ + --lang_adapt_strategies $ADPT_STRATEGY \ + --embedding_strategies $EMBD_SRATEGY \ + --load_best_model_at_end \ + --use_auth_token +``` From 153f7da59aa923070aaea9c48e3ca38dd06593d2 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 1 Jul 2022 23:16:25 +0700 Subject: [PATCH 123/142] Delete calculate_bias_changes.py --- scripts/lang_adapt/calculate_bias_changes.py | 49 -------------------- 1 file changed, 49 deletions(-) delete mode 100644 scripts/lang_adapt/calculate_bias_changes.py diff --git a/scripts/lang_adapt/calculate_bias_changes.py b/scripts/lang_adapt/calculate_bias_changes.py deleted file mode 100644 index fdb72c0..0000000 --- a/scripts/lang_adapt/calculate_bias_changes.py +++ /dev/null @@ -1,49 +0,0 @@ -from transformers import AutoModel - -model_0 = AutoModel.from_pretrained() -model_F = AutoModel.from_pretrained() - - -for (name_0, 
param_0), (name_F, param_F) in zip(model_0.named_parameters(), model_F.named_parameters()): - - param_name = - if "bias" in name_0: - if "query_key_value": - - # Query, Key, Value are merged in one MLP, - # so we need to seperate the bias terms - - head_size = model_0.config.hidden_size // model_0.config.num_attention_heads - - _q_change = None - _k_change = None - _v_change = None - for qkv_bias in [param_0, param_F]: - qkv_bias = qkv_bias.view(num_attention_heads, 3*head_size) - - if _q_change is None: - _q_change = qkv_bias[..., :head_size] - else: - _q_change -= qkv_bias[..., :head_size] - _q_change = torch.norm(_q_change) - - if _k_change is None: - _k_change = qkv_bias[..., head_size: 2 * head_size] - else: - _k_change -= qkv_bias[..., head_size: 2 * head_size] - _k_change = torch.norm(_k_change) - - if _v_change is None: - _v_change = qkv_bias[..., 2 * head_size:] - else: - _v_change -= qkv_bias[..., 2 * head_size:] - _v_change = torch.norm(_v_change) - else: - bias_change = torch.norm(param_0 - param_F) - -transformer.h.0.input_layernorm.bias -transformer.h.0.self_attention.query_key_value.bias -transformer.h.0.self_attention.dense.bias -transformer.h.0.post_attention_layernorm.bias -transformer.h.0.mlp.dense_h_to_4h.bias -transformer.h.0.mlp.dense_4h_to_h.bias \ No newline at end of file From 7c48c20a7f1b74448268f4f4f58fddecc211381e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 1 Jul 2022 23:58:00 +0700 Subject: [PATCH 124/142] removed finetune_strategies in favor of lang_adapt_strategis --- scripts/lang_adapt/madx_run_clm.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index e6e3caa..c29ed59 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -111,10 +111,6 @@ class ModelArguments: default="", metadata={"help": "choose one of the two strategies - 'replace', 'extend', 'overlap-replace'"}, ) - finetuning_strategies: str = field( - default="full", - metadata={"help": "choose one of the three strategies - 'full', 'bitfit'"}, - ) adapter_placement: str = field( default="all", metadata={"help": "list of layers where to place the adapters: all: use all layers, '17,24': list layers id separated by ','"}, @@ -580,16 +576,6 @@ def zero_grad(grad): #elif model_args.embedding_strategies == "replace": # model.resize_token_embeddings(len(tokenizer)) - print(f"✅ Use Finetuning Strategy: {model_args.finetuning_strategies}") - - if model_args.finetuning_strategies == "bitfit": - for name, param in model.transformer.named_parameters(): - if 'bias' not in name: - param.requires_grad = False - elif model_args.finetuning_strategies == "full": - # No modification needed - pass - trainable_params = 0 frozen_params = 0 emb_params = 0 @@ -599,6 +585,10 @@ def zero_grad(grad): emb_params += param.numel() elif model_args.lang_adapt_strategies == "emb": param.requires_grad = False + elif model_args.lang_adapt_strategies == "bitfit": + for name, param in model.transformer.named_parameters(): + if 'bias' not in name: + param.requires_grad = False if not param.requires_grad: print(f"🥶 Frozen layer '{name}'") @@ -630,7 +620,7 @@ def main(): training_args.data_dir = f'{training_args.output_dir}' - assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt') + assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt', 'bitfit') assert model_args.embedding_strategies in ('replace', 'extend', 
'overlap-replace') # Setup logging From 1c49adf7ffc1ba0d395d115100499efe438f005f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 2 Jul 2022 13:29:23 +0700 Subject: [PATCH 125/142] Update README.md --- scripts/lang_adapt/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index 8b1b38c..20d2d05 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -22,7 +22,7 @@ model="bigscience/bloom-1b3" python ./scripts/lang_adapt/tokenized4clm_sampled.py \ --lang $lang \ --model $model \ ---tokenizer_dir \ +--tokenizer_dir $tokenizer_dir \ --hf_cache_dir $cache_dir \ --vocab_size $vocab_size \ --sample_size $sample_size \ From e955acfc8488778fd8a1288ac753b517b4d17607 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 2 Jul 2022 14:51:20 +0700 Subject: [PATCH 126/142] Update README.md --- scripts/lang_adapt/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index 20d2d05..e20912d 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -63,7 +63,7 @@ cache_dir=... # as above output_dir=... # directory to save adapted model output_dir="${output_dir}/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY}" logging_dir=... # directory to log loss curves to tensorboard -logging_dir="${logging_dir}/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY} +logging_dir="${logging_dir}/${BIGS_MODEL##*/}_${LANG}_${ADPT_STRATEGY}_${DATA_SAMPLES}samples_${VOCAB_SIZE}vocab_${EMBD_SRATEGY}" mkdir -p $output_dir mkdir -p $logging_dir From 2484b22901d1538e41656bdbbfb5f94df1261fbe Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 2 Jul 2022 15:04:26 +0700 Subject: [PATCH 127/142] fixed logic --- scripts/lang_adapt/madx_run_clm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index f7cecb6..4d56154 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -104,7 +104,7 @@ class ModelArguments: }, ) lang_adapt_strategies: str = field( - default="", + default=None, metadata={"help": "choose one of the three strategies - 'emb', 'emb-and-adpt', 'emb-then-adpt'"}, ) embedding_strategies: str = field( @@ -620,10 +620,10 @@ def zero_grad(grad): if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name: param.requires_grad = True emb_params += param.numel() - elif model_args.lang_adapt_strategies == "emb": - param.requires_grad = False - elif model_args.lang_adapt_strategies == "bitfit": - for name, param in model.transformer.named_parameters(): + elif model_args.lang_adapt_strategies is not None: + if model_args.lang_adapt_strategies == "emb": + param.requires_grad = False + elif model_args.lang_adapt_strategies == "bitfit": if 'bias' not in name: param.requires_grad = False From 81ace497bc86cf613b4c14176cd755aa650492a6 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 4 Jul 2022 22:53:00 -0400 Subject: [PATCH 128/142] jz --- jz/README.md | 59 ++++++++++++ jz/emb.sh | 99 +++++++++++++++++++++ requirements.txt | 5 ++ scripts/eval/eval.py | 5 +- scripts/lang_adapt/tokenized4clm_sampled.py | 20 ++--- scripts/requirements.txt | 4 - 6 files changed, 176 insertions(+), 16 deletions(-) create mode 100644 jz/README.md create mode 100644 jz/emb.sh 
create mode 100644 requirements.txt delete mode 100644 scripts/requirements.txt diff --git a/jz/README.md b/jz/README.md new file mode 100644 index 0000000..5e3f89e --- /dev/null +++ b/jz/README.md @@ -0,0 +1,59 @@ +# Run on JZ + +## Getting Started +``` +git clone https://github.com/bigscience-workshop/multilingual-modeling.git +cd multilingual-modeling/ +``` + +## Change Configuration +### SLURM Configuration +``` +# use a single V100 for each run +#SBATCH --partition=gpu-he --gres=gpu:1 + +# output/error files for tracking pip installation +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/misc/lang-adapt-env_jz_lang_adapter.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/misc/lang-adapt-env_jz_lang_adapter.err +``` + +### Folders configuration (Line 22 - 28 in jz/emb.sh) +``` +# virtual environment folder for `python3 -m venv $env_dir` +env_dir="/users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/jz/env_jz_lang_adapter" + +# cache directory for HuggingFace datasets +cache_dir="/users/zyong2/data/zyong2/huggingface" + +# cloned GitHub directory +mm_dir="/users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling" + +# directory to save adapted models and trained tokenizers +output_dir="/users/zyong2/data/zyong2/bigscience/data/processed/misc/" + +# folder for storing error and output logging text files +logging_txt_dir="/users/zyong2/data/zyong2/bigscience/logs/misc" + +# folder for storing all tensorboard logging +logging_tb_dir="/users/zyong2/data/zyong2/bigscience/reports/misc/" +``` + +## Runs +### 07/05/2022 (Language Adaptation - Embedding-only) +``` +sbatch jz/emb.sh my 100000 24000 extend +sbatch jz/emb.sh my 10000 5000 extend +sbatch jz/emb.sh my 1000 5000 extend + +sbatch jz/emb.sh si 100000 24000 extend +sbatch jz/emb.sh si 10000 5000 extend +sbatch jz/emb.sh si 1000 5000 extend + +sbatch jz/emb.sh az 100000 24000 extend +sbatch jz/emb.sh az 10000 5000 extend +sbatch jz/emb.sh az 1000 5000 extend + +sbatch jz/emb.sh de 100000 24000 extend +sbatch jz/emb.sh de 10000 5000 extend +sbatch jz/emb.sh de 1000 5000 extend +``` \ No newline at end of file diff --git a/jz/emb.sh b/jz/emb.sh new file mode 100644 index 0000000..0ea814b --- /dev/null +++ b/jz/emb.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Request half an hour of runtime: +#SBATCH --time=2-23:59:00 + +# Ask for the GPU partition and 1 GPU +#SBATCH --partition=gpu-he --gres=gpu:1 + +# Default resources are 1 core with 2.8GB of memory. 
+#SBATCH --ntasks=8
+
+# Use more memory (10GB) (CPU RAM):
+#SBATCH --mem=200g
+
+# Specify a job name:
+#SBATCH -J lang-adapt-env_jz_lang_adapter
+
+# Specify an output file
+#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/misc/lang-adapt-env_jz_lang_adapter.out
+#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/misc/lang-adapt-env_jz_lang_adapter.err
+
+env_dir="/users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/jz/env_jz_lang_adapter"
+cache_dir="/users/zyong2/data/zyong2/huggingface"
+mm_dir="/users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling"
+
+output_dir="/users/zyong2/data/zyong2/bigscience/data/processed/misc/" # adapted model and trained tokenizer directory
+logging_txt_dir="/users/zyong2/data/zyong2/bigscience/logs/misc" # error and output logging
+logging_tb_dir="/users/zyong2/data/zyong2/bigscience/reports/misc/" # tensorboard logging
+
+mkdir -p $output_dir
+mkdir -p $logging_tb_dir
+mkdir -p $logging_txt_dir
+
+lang=$1 # language
+sample_size=$2 # training sample size
+vocab_size=$3 # vocab size of tokenizer
+tok_strategy=$4 # extend, replace, overlap-replace
+bigs_model="bigscience/bloom-1b3"
+adpt_strategy="emb"
+
+tokenizer_dir="${output_dir}/tok_$(basename $bigs_model)_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_${tok_strategy}"
+logging_tb_dir="${logging_tb_dir}/$(basename $bigs_model)_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_tok-${tok_strategy}_adpt-${adpt_strategy}"
+
+# setup environment
+module load python/3.7.4
+[ -d $env_dir ] || python3 -m venv $env_dir
+source "${env_dir}/bin/activate"
+# pip3 install --upgrade pip
+# pip3 install -r "${mm_dir}/requirements.txt"
+
+# train tokenizer
+python "${mm_dir}/scripts/lang_adapt/tokenized4clm_sampled.py" \
+--lang $lang \
+--model $bigs_model \
+--tokenizer_dir $tokenizer_dir \
+--hf_cache_dir $cache_dir \
+--vocab_size $vocab_size \
+--sample_size $sample_size \
+--use_auth_token \
+--tok_strategy $tok_strategy \
+> "${logging_txt_dir}/tok_$(basename $bigs_model)_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_${tok_strategy}.txt" \
+2> "${logging_txt_dir}/tok_$(basename $bigs_model)_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_${tok_strategy}.err"
+
+
+# finetune language model for language adaptation
+python "${mm_dir}/scripts/lang_adapt/madx_run_clm.py" \
+    --seed 0 \
+    --fp16 \
+    --model_name_or_path $bigs_model \
+    --tokenizer_name $tokenizer_dir \
+    --dataset_name oscar \
+    --cache_dir $cache_dir \
+    --dataset_config_name "unshuffled_deduplicated_${lang}" \
+    --logging_dir $logging_tb_dir \
+    --report_to "tensorboard" \
+    --learning_rate 0.001 \
+    --do_train \
+    --do_eval \
+    --output_dir $output_dir \
+    --preprocessing_num_workers 8 \
+    --overwrite_output_dir \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 4 \
+    --per_device_eval_batch_size 2 \
+    --eval_accumulation_steps 4 \
+    --eval_steps 1000 \
+    --evaluation_strategy "steps" \
+    --max_eval_samples 5000 \
+    --save_steps 5000 \
+    --save_strategy "steps" \
+    --max_train_samples $sample_size \
+    --max_steps 50000 \
+    --logging_steps 1000 \
+    --lang_adapt_strategies $adpt_strategy \
+    --embedding_strategies $tok_strategy \
+    --load_best_model_at_end \
+    --use_auth_token \
+    > "${logging_txt_dir}/$(basename $bigs_model)_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_tok-${tok_strategy}_adpt-${adpt_strategy}.txt" \
+    2> "${logging_txt_dir}/$(basename $bigs_model)_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_tok-${tok_strategy}_adpt-${adpt_strategy}.err"
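The `jz/emb.sh` job above chains tokenizer training and embedding-only adaptation. A quick way to verify its outputs, assuming the `extend` strategy was used, is the sketch below; it is not part of the repository, and the two directory assignments are placeholders for whatever `$tokenizer_dir` and `$output_dir` the run actually wrote.

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer_dir = "..."       # placeholder: the tok_bloom-1b3_<lang>_... directory written by the job
adapted_model_dir = "..."   # placeholder: the adapted-model output directory

base_tok = AutoTokenizer.from_pretrained("bigscience/bloom-1b3", use_auth_token=True)
new_tok = AutoTokenizer.from_pretrained(tokenizer_dir)

# With the "extend" strategy, original BLOOM tokens keep their ids and the
# newly learned tokens are appended after the original vocabulary.
assert len(new_tok) >= len(base_tok)
sample = list(base_tok.get_vocab().items())[:1000]
assert all(new_tok.convert_tokens_to_ids(tok) == idx for tok, idx in sample)

# The adapted checkpoint's embedding matrix should match the extended vocabulary.
model = AutoModelForCausalLM.from_pretrained(adapted_model_dir)
assert model.get_input_embeddings().weight.shape[0] == len(new_tok)
print(f"{len(new_tok) - len(base_tok)} embedding rows were added for the new tokens")
```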
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f3f0d9b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +git+https://github.com/yongzx/adapter-transformers.git@f55ab013599088a35c87a880ba13a6d912e27ef4 +--extra-index-url https://download.pytorch.org/whl/cu113 +torch +datasets +tensorboardX \ No newline at end of file diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index c85f637..b03a927 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -155,7 +155,7 @@ logger.info(f"test = {len(test_dataset)} samples") # load tokenizer -logger.info("Loading tokenizer...") +logger.info(f"Loading tokenizer from {args.tokenizer}...") tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision, add_prefix_space=args.dataset in [WIKIANN]) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -416,6 +416,7 @@ def load_task_specific_adapters(args, model, inference=False): model.train_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") return model else: + print(f"[Evaluation] Load task adapters from {args.pretrained_adapters_dir}/{args.dataset.split('/')[-1]}-task-adapter") adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.dataset.split('/')[-1]}-task-adapter") model.set_active_adapters(adapter_name) return model @@ -433,11 +434,13 @@ def load_embedding_layers(args, tokenizer, model): model.transformer.word_embeddings = token_embedding model.transformer.word_embeddings_layernorm = add_embedding + logger.info(f"Replaced embeddings with {token_embedding} and {add_embedding}...") return model def load_language_adapters(args, model): adapter_name = model.load_adapter(args.madx_lang_adapter, config="pfeiffer+inv") model.set_active_adapters(adapter_name) + logger.info(f"Added Adapter {args.madx_lang_adapter}...") return model pad_token_id = en_tokenizer.pad_token_id if (not inference and args.cross_lingual) else tokenizer.pad_token_id diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index a221e87..5955c94 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -28,17 +28,15 @@ parser.add_argument('--lang', type=str, required=True) parser.add_argument('--model', type=str, required=True) parser.add_argument('--tokenizer_dir', type=str, required=True) +parser.add_argument('--tok_strategy', type=str, choices=["replace", "extend"] ,required=True) parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) -parser.add_argument('--vocab_size', default=130_000, type=int) -parser.add_argument('--extend_vocab', action='store_true') -parser.add_argument('--sample_size', default=None, type=int) +parser.add_argument('--vocab_size', default=24_000, type=int) +parser.add_argument('--sample_size', default=100_000, type=int) parser.add_argument("--use_auth_token", default=False, action="store_true") parser.add_argument("--seed", default=42, type=int) args = parser.parse_args() lang = args.lang -if args.extend_vocab: - assert args.vocab_size < 100_000 if args.sample_size: raw_datasets = load_dataset( @@ -67,7 +65,7 @@ def batch_iterator(): unique_toks = set() model_name = pathlib.Path(args.model).parts[-1] -if args.extend_vocab: +if args.tok_strategy == 'extend': # Yong: have checked that added tokens would have indices after the original vocab size. 
tokenizer = AutoTokenizer.from_pretrained(args.model) assert tokenizer.is_fast @@ -76,14 +74,14 @@ def batch_iterator(): added = tokenizer.add_tokens([tok for tok in new_tokenizer.vocab.keys()]) print([tok for tok in new_tokenizer.vocab.keys()]) print(f"Overlap with previous vocab: {args.vocab_size - added}") - tokenizer.save_pretrained(f"{args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_extend") - print(f"Saved tokenizer to {args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_extend") + tokenizer.save_pretrained(f"{args.tokenizer_dir}") + print(f"Saved tokenizer to {args.tokenizer_dir}") -else: +elif args.tok_strategy == 'replace': tokenizer = AutoTokenizer.from_pretrained(args.model, use_auth_token=args.use_auth_token) assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) print("Unique toks, ", len(unique_toks)) print("✅ Trained tokenizer with len ", len(new_tokenizer)) - new_tokenizer.save_pretrained(f"{args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_replace") - print(f"Saved tokenizer to {args.tokenizer_dir}/tok_{model_name}_{lang}_oscar_{args.sample_size}samples_{args.vocab_size}vocab_replace") + new_tokenizer.save_pretrained(f"{args.tokenizer_dir}") + print(f"Saved tokenizer to {args.tokenizer_dir}") diff --git a/scripts/requirements.txt b/scripts/requirements.txt deleted file mode 100644 index fd49f62..0000000 --- a/scripts/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -git+https://github.com/yongzx/adapter-transformers.git@bloom -datasets -torch --extra-index-url https://download.pytorch.org/whl/cu113 -tensorboardX \ No newline at end of file From d3feb31f71d63e38081a18f7ba51887b90ff4351 Mon Sep 17 00:00:00 2001 From: yongzx Date: Mon, 4 Jul 2022 22:59:06 -0400 Subject: [PATCH 129/142] update README --- jz/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/jz/README.md b/jz/README.md index 5e3f89e..684755c 100644 --- a/jz/README.md +++ b/jz/README.md @@ -1,6 +1,8 @@ # Run on JZ ## Getting Started +Clone the GitHub Repository and `cd` into it to run commands like `sbatch jz/emb.sh my 100000 24000 extend`. + ``` git clone https://github.com/bigscience-workshop/multilingual-modeling.git cd multilingual-modeling/ @@ -8,6 +10,7 @@ cd multilingual-modeling/ ## Change Configuration ### SLURM Configuration +We need to change the SLURM setting according to JZ to get the necessary compute. ``` # use a single V100 for each run #SBATCH --partition=gpu-he --gres=gpu:1 @@ -17,7 +20,8 @@ cd multilingual-modeling/ #SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/misc/lang-adapt-env_jz_lang_adapter.err ``` -### Folders configuration (Line 22 - 28 in jz/emb.sh) +### Directory configuration (Line 22 - 28 in jz/emb.sh) +Also, we need to change 6 lines of the directory configuration. ``` # virtual environment folder for `python3 -m venv $env_dir` env_dir="/users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/jz/env_jz_lang_adapter" @@ -40,6 +44,7 @@ logging_tb_dir="/users/zyong2/data/zyong2/bigscience/reports/misc/" ## Runs ### 07/05/2022 (Language Adaptation - Embedding-only) +Run the following commands for doing language adaptation for 4 languages varying along the the size of training samples. 
``` sbatch jz/emb.sh my 100000 24000 extend sbatch jz/emb.sh my 10000 5000 extend From 1731027bc6a9bf1879d1ad1065e09ec00e8c18d2 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 5 Jul 2022 13:27:46 -0400 Subject: [PATCH 130/142] update --- scripts/lang_adapt/madx_run_clm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 4d56154..c2284d7 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -488,7 +488,9 @@ def get_adapter_config(adapter_args, model_args): dropout = adapter_args.dropout_lora, init_weights = adapter_args.init_weights_lora, ) + else: + # TODO: confirm with Vassilina what goes into this condition if model_args.adapter_placement == "all": adapter_config = AdapterConfig.load( adapter_args.adapter_config, @@ -620,6 +622,7 @@ def zero_grad(grad): if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name: param.requires_grad = True emb_params += param.numel() + elif model_args.lang_adapt_strategies is not None: if model_args.lang_adapt_strategies == "emb": param.requires_grad = False @@ -633,7 +636,6 @@ def zero_grad(grad): else: print(f"🚀 Trainable layer '{name}'") trainable_params += param.numel() - print(f"Total frozen parameters: {frozen_params}") print(f"Total emb parameters (wte, wpe): {emb_params}") From 3fd568a0e50ef263c38296fb3b8646f9169f9b14 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 5 Jul 2022 13:43:22 -0400 Subject: [PATCH 131/142] uncomment pip install --- jz/emb.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jz/emb.sh b/jz/emb.sh index 0ea814b..06a34d3 100644 --- a/jz/emb.sh +++ b/jz/emb.sh @@ -45,8 +45,8 @@ logging_tb_dir="${logging_tb_dir}/$(basename $bigs_model)_${lang}_oscar_${sample module load python/3.7.4 [ -d $env_dir ] || python3 -m venv $env_dir source "${env_dir}/bin/activate" -# pip3 install --upgrade pip -# pip3 install -r "${mm_dir}/requirements.txt" +pip3 install --upgrade pip +pip3 install -r "${mm_dir}/requirements.txt" # train tokenizer python "${mm_dir}/scripts/lang_adapt/tokenized4clm_sampled.py" \ From 2f806ebc9c878ad2e1cc8170293895609d310924 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 5 Jul 2022 13:46:36 -0400 Subject: [PATCH 132/142] update tokenizer training --- scripts/lang_adapt/README.md | 18 ++++++++++-------- scripts/lang_adapt/tokenized4clm_sampled.py | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index e20912d..f49178e 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -9,25 +9,27 @@ Run `tokenized4clm_sampled.py` to train the tokenizer on the subset of OSCAR dat - `vocab_size`: vocab size of the tokenizer - `sample_size`: the amount of samples to use to train the tokenizer (randomly selected) - `use_auth_token`: must be used for BLOOM model -- `extend`: if set, it means that we are extending instead of replacing. +- `tok_strategy`: extend, replace or overlap-replace ``` -tokenizer_dir=... # directory to save trained tokenizer -cache_dir=... # directory to cache downloaded HF model +cache_dir=... lang=... # language sample_size=... # training sample size vocab_size=... # vocab size of tokenizer -model="bigscience/bloom-1b3" +tok_strategy=... 
# extend, replace, overlap-replace +bigs_model="bigscience/bloom-1b3" + +tokenizer_dir="${output_dir}/tok_$(basename $bigs_model)_${lang}_oscar_${sample_size}samples_${vocab_size}vocab_${tok_strategy}" python ./scripts/lang_adapt/tokenized4clm_sampled.py \ --lang $lang \ ---model $model \ ---tokenizer_dir $tokenizer_dir \ +--model $MODEL \ +--tokenizer_dir /users/zyong2/data/zyong2/bigscience/data/processed/020 \ --hf_cache_dir $cache_dir \ --vocab_size $vocab_size \ --sample_size $sample_size \ ---use_auth_token ---extend # use "extend" for the embedding strategy of extending vocab. +--use_auth_token \ +--tok_strategy $tok_strategy ``` --- diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index 5955c94..ae76a04 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -77,7 +77,7 @@ def batch_iterator(): tokenizer.save_pretrained(f"{args.tokenizer_dir}") print(f"Saved tokenizer to {args.tokenizer_dir}") -elif args.tok_strategy == 'replace': +elif args.tok_strategy in ('replace', 'overlap-replace'): tokenizer = AutoTokenizer.from_pretrained(args.model, use_auth_token=args.use_auth_token) assert tokenizer.is_fast new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=args.vocab_size) From 2e94a9e82bceb8487d138e3d8320efad146068d6 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 5 Jul 2022 13:48:05 -0400 Subject: [PATCH 133/142] update bigs_model --- scripts/lang_adapt/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index f49178e..ad7232f 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -23,7 +23,7 @@ tokenizer_dir="${output_dir}/tok_$(basename $bigs_model)_${lang}_oscar_${sample_ python ./scripts/lang_adapt/tokenized4clm_sampled.py \ --lang $lang \ ---model $MODEL \ +--model $bigs_model \ --tokenizer_dir /users/zyong2/data/zyong2/bigscience/data/processed/020 \ --hf_cache_dir $cache_dir \ --vocab_size $vocab_size \ From d5209a75cb66bd4934f0898b95d329f95e596144 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 5 Jul 2022 13:54:10 -0400 Subject: [PATCH 134/142] update tokenizer_dir --- scripts/lang_adapt/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index ad7232f..8c2d4da 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -24,7 +24,7 @@ tokenizer_dir="${output_dir}/tok_$(basename $bigs_model)_${lang}_oscar_${sample_ python ./scripts/lang_adapt/tokenized4clm_sampled.py \ --lang $lang \ --model $bigs_model \ ---tokenizer_dir /users/zyong2/data/zyong2/bigscience/data/processed/020 \ +--tokenizer_dir $tokenizer_dir \ --hf_cache_dir $cache_dir \ --vocab_size $vocab_size \ --sample_size $sample_size \ From 5fac29aa0936efc2a57831e1b5d500e01cbfd78c Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 5 Jul 2022 14:23:00 -0400 Subject: [PATCH 135/142] load_best using eval metrics --- scripts/eval/eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index b03a927..6e1a6cb 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -78,6 +78,7 @@ trainer_no_task_adpt_class_mapping = {XNLI: Trainer, XLSUM: Seq2SeqTrainer, WIKIANN: Trainer} trainer_class_mapping = {XNLI: AdapterTrainer, XLSUM: Seq2SeqAdapterTrainer, WIKIANN: AdapterTrainer} trainer_args_mapping = {XNLI: TrainingArguments, XLSUM: 
Seq2SeqTrainingArguments, WIKIANN: TrainingArguments} +task_eval_metric_best_model = {XNLI: 'eval_accuracy', WIKIANN: 'eval_overall_f1'} args = parser.parse_args() @@ -362,6 +363,7 @@ def compute_beam_search_metrics(model, dataset): report_to="tensorboard", logging_dir=f"{args.output_dir}/logs", load_best_model_at_end=True, + metric_for_best_model=task_eval_metric_best_model[args.dataset], deepspeed=args.deepspeed, **optional_trainer_args, ) From 7e0feca002778fae17721738ff502af82c02b741 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 5 Jul 2022 14:48:05 -0400 Subject: [PATCH 136/142] load best model --- scripts/eval/eval.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 6e1a6cb..3dcd67e 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -1,6 +1,7 @@ import logging import argparse import os +import json from datasets import load_dataset from datasets import load_metric @@ -521,9 +522,11 @@ def load_language_adapters(args, model): assert len(evaluation_dirs) > 0 print(f"Found {len(evaluation_dirs)} checkpoints") - # load the last checkpoint. - args.pretrained_adapters_dir = f"{args.output_dir}/{evaluation_dirs[-1]}" - print(f"[Evaluation] Loading trained model from {args.pretrained_adapters_dir}") + # load the best checkpoint. + with open(f"{args.output_dir}/{evaluation_dirs[-1]}/trainer_state.json") as rf: + args.pretrained_adapters_dir = json.load(rf)['best_model_checkpoint'] + + print(f"[Evaluation] Loading trained model (best checkpoint) from {args.pretrained_adapters_dir}") model = load_model(args, inference=True) model.eval() From d4a887e0b22f94a11abd883f41580bc901308496 Mon Sep 17 00:00:00 2001 From: yongzx Date: Tue, 5 Jul 2022 23:27:05 -0400 Subject: [PATCH 137/142] missing output_dir --- scripts/lang_adapt/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/lang_adapt/README.md b/scripts/lang_adapt/README.md index 8c2d4da..0c3c6a7 100644 --- a/scripts/lang_adapt/README.md +++ b/scripts/lang_adapt/README.md @@ -13,6 +13,7 @@ Run `tokenized4clm_sampled.py` to train the tokenizer on the subset of OSCAR dat ``` cache_dir=... +output_dir=... lang=... # language sample_size=... # training sample size vocab_size=... # vocab size of tokenizer From 585cb5d871e0ea3ba609948af770ff26a895cc4e Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 7 Jul 2022 09:45:52 -0400 Subject: [PATCH 138/142] scripts for wikiann --- .../scripts_wikiann/baseline_wikiann_de.sh | 30 +++++++------ ...kiann_de.sh => wikiann_de_task_adpters.sh} | 42 ++++++++++--------- 2 files changed, 40 insertions(+), 32 deletions(-) rename scripts/eval/scripts_wikiann/{adpt_wikiann_de.sh => wikiann_de_task_adpters.sh} (55%) diff --git a/scripts/eval/scripts_wikiann/baseline_wikiann_de.sh b/scripts/eval/scripts_wikiann/baseline_wikiann_de.sh index 04b6a1c..a6b78d4 100644 --- a/scripts/eval/scripts_wikiann/baseline_wikiann_de.sh +++ b/scripts/eval/scripts_wikiann/baseline_wikiann_de.sh @@ -5,20 +5,19 @@ # Ask for the GPU partition and 1 GPU #SBATCH --partition=gpu-he --gres=gpu:1 -#SBATCH --array=100 # Default resources are 1 core with 2.8GB of memory. 
#SBATCH --ntasks=4 # Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g +#SBATCH --mem=100g # Specify a job name: -#SBATCH -J exp-021-wikiann-baseline_wikiann_de +#SBATCH -J exp-021-wikiann-baseline_wikiann_de_task_adapters # Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/baseline_wikiann_de.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/baseline_wikiann_de.err +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/baseline_wikiann_de_task_adapters.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/baseline_wikiann_de_task_adapters.err # Set up the environment by loading modules set -a # automatically export all variables @@ -30,11 +29,7 @@ module load gitlfs/2.7.1 source $FP_BIGS/env_try_lang_adapter/bin/activate -OUTPUT_DIR="/users/zyong2/data/zyong2/bigscience/data/processed/021-wikiann/bloom-1b3-baseline-de" # where you want to save checkpoints at -LANG="de" -CACHE_DIR="/users/zyong2/data/zyong2/huggingface" # cache dir for saving/loading HF models and wikiann datasets. - -LR=1e-4 +LR=1e-5 BIGS_MODEL="bigscience/bloom-1b3" MODEL_NAME="bigscience/bloom-1b3" @@ -42,6 +37,11 @@ TOKENIZER_NAME="bigscience/bloom-1b3" # task-specific arguments TASK_DATASET="wikiann" +TASK_LAYER="task-adapters" +LANG="de" +OUTPUT_DIR="/users/zyong2/data/zyong2/bigscience/data/processed/021-wikiann/$(basename $BIGS_MODEL)-baseline-${LANG}-FT-${TASK_LAYER}" # where you want to save checkpoints at +CACHE_DIR="/users/zyong2/data/zyong2/huggingface" # cache dir for saving/loading HF models and XNLI datasets. + mkdir -p $OUTPUT_DIR @@ -50,14 +50,18 @@ $OUTPUT_DIR \ --lang $LANG \ --cache_dir $CACHE_DIR \ --dataset $TASK_DATASET \ ---num_train_epochs 100 \ +--num_train_epochs 5 \ --learning_rate $LR \ --per_device_train_batch_size 8 \ ---gradient_accumulation_steps 1 \ +--gradient_accumulation_steps 4 \ --original_model $BIGS_MODEL \ --adapted_model_dir $MODEL_NAME \ --tokenizer $TOKENIZER_NAME \ --do_train \ --do_predict \ +--task_layers $TASK_LAYER \ --baseline - +# --use_partial_data \ +# --use_partial_train_data 100 \ +# --use_partial_val_data 100 \ +# --use_partial_test_data 100 diff --git a/scripts/eval/scripts_wikiann/adpt_wikiann_de.sh b/scripts/eval/scripts_wikiann/wikiann_de_task_adpters.sh similarity index 55% rename from scripts/eval/scripts_wikiann/adpt_wikiann_de.sh rename to scripts/eval/scripts_wikiann/wikiann_de_task_adpters.sh index 74ddfd8..d80eff5 100644 --- a/scripts/eval/scripts_wikiann/adpt_wikiann_de.sh +++ b/scripts/eval/scripts_wikiann/wikiann_de_task_adpters.sh @@ -5,20 +5,19 @@ # Ask for the GPU partition and 1 GPU #SBATCH --partition=gpu-he --gres=gpu:1 -#SBATCH --array=100 # Default resources are 1 core with 2.8GB of memory. 
#SBATCH --ntasks=4 # Use more memory (10GB) (CPU RAM): -#SBATCH --mem=50g +#SBATCH --mem=100g # Specify a job name: -#SBATCH -J exp-021-wikiann-adpt_wikiann_de +#SBATCH -J exp-021-wikiann-bloom1b3_extend_wikiann_de_task_adapters # Specify an output file -#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/adpt_wikiann_de.out -#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/adpt_wikiann_de.err +#SBATCH -o /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/bloom1b3_extend_wikiann_de_task_adapters.out +#SBATCH -e /users/zyong2/data/zyong2/bigscience/logs/log-021-wikiann/bloom1b3_extend_wikiann_de_task_adapters.err # Set up the environment by loading modules set -a # automatically export all variables @@ -30,34 +29,39 @@ module load gitlfs/2.7.1 source $FP_BIGS/env_try_lang_adapter/bin/activate -OUTPUT_DIR="/users/zyong2/data/zyong2/bigscience/data/processed/021-wikiann/bloom-1b3-adpt-de" # where you want to save checkpoints at -LANG="de" -CACHE_DIR="/users/zyong2/data/zyong2/huggingface" # cache dir for saving/loading HF models and wikiann datasets. - -LR=1e-4 +LR=1e-5 BIGS_MODEL="bigscience/bloom-1b3" -ADAPTER_MODEL_DIR="/users/zyong2/data/zyong2/bigscience/data/processed/020/bloom-1b3_de_emb-and-adpt_1000samples" -TOKENIZER_NAME="bigscience/bloom-1b3" -MADX="/users/zyong2/data/zyong2/bigscience/data/processed/020/bloom-1b3_de_emb-and-adpt_1000samples/oscar_pfeiffer+inv_de" +MODEL_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/bloom-1b3_de_emb_100000samples_24000vocab_extend" +TOKENIZER_NAME="/users/zyong2/data/zyong2/bigscience/data/processed/020/bloom-1b3_de_emb_100000samples_24000vocab_extend" # task-specific arguments TASK_DATASET="wikiann" +TASK_LAYER="task-adapters" +LANG="de" +OUTPUT_DIR="/users/zyong2/data/zyong2/bigscience/data/processed/021-wikiann/$(basename $MODEL_NAME)-${LANG}-FT-${TASK_LAYER}" # where you want to save checkpoints at +CACHE_DIR="/users/zyong2/data/zyong2/huggingface" # cache dir for saving/loading HF models and XNLI datasets. 
+ mkdir -p $OUTPUT_DIR -python /users/zyong2/data/zyong2/bigscience/gh/multilingual-modeling/scripts/eval/eval.py \ +python ./scripts/eval/eval.py \ $OUTPUT_DIR \ --lang $LANG \ --cache_dir $CACHE_DIR \ --dataset $TASK_DATASET \ ---num_train_epochs 2 \ +--num_train_epochs 100 \ --learning_rate $LR \ --per_device_train_batch_size 8 \ ---gradient_accumulation_steps 1 \ +--gradient_accumulation_steps 4 \ --original_model $BIGS_MODEL \ ---adapted_model_dir $ADAPTER_MODEL_DIR \ ---madx_lang_adapter $MADX \ +--adapted_model_dir $MODEL_NAME \ --tokenizer $TOKENIZER_NAME \ --do_train \ ---do_predict +--do_predict \ +--task_layers $TASK_LAYER + +# --use_partial_data \ +# --use_partial_train_data 100 \ +# --use_partial_val_data 100 \ +# --use_partial_test_data 100 From 9cca8933af641a5578411e02d8af3e44e7fc0fb4 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 7 Jul 2022 09:46:43 -0400 Subject: [PATCH 139/142] tok_strategy adds overlap-replace --- scripts/lang_adapt/tokenized4clm_sampled.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lang_adapt/tokenized4clm_sampled.py b/scripts/lang_adapt/tokenized4clm_sampled.py index ae76a04..abab025 100644 --- a/scripts/lang_adapt/tokenized4clm_sampled.py +++ b/scripts/lang_adapt/tokenized4clm_sampled.py @@ -28,7 +28,7 @@ parser.add_argument('--lang', type=str, required=True) parser.add_argument('--model', type=str, required=True) parser.add_argument('--tokenizer_dir', type=str, required=True) -parser.add_argument('--tok_strategy', type=str, choices=["replace", "extend"] ,required=True) +parser.add_argument('--tok_strategy', type=str, choices=["replace", "extend", "overlap-replace"] ,required=True) parser.add_argument('--hf_cache_dir', default="~/.cache/huggingface/transformers", type=str) parser.add_argument('--vocab_size', default=24_000, type=int) parser.add_argument('--sample_size', default=100_000, type=int) From 031a14a71629735a8bae83658cf424afa3d81d83 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 7 Jul 2022 09:47:45 -0400 Subject: [PATCH 140/142] remove outdated code and support continual pretraining --- scripts/lang_adapt/madx_run_clm.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index c2284d7..0d73be4 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -103,9 +103,13 @@ class ModelArguments: "with private models)." 
}, ) + reinit_weights: bool = field( + default=False, + metadata={"help": "choose one of the three strategies - 'emb', 'emb-and-adpt', 'emb-then-adpt'"}, + ) lang_adapt_strategies: str = field( default=None, - metadata={"help": "choose one of the three strategies - 'emb', 'emb-and-adpt', 'emb-then-adpt'"}, + metadata={"help": "language adaptation strategies"}, ) embedding_strategies: str = field( default="", @@ -594,6 +598,7 @@ def get_adapter_config(adapter_args, model_args): elif model_args.embedding_strategies == "extend": original_embedding_layer = model.get_input_embeddings() original_vocab_size = original_embedding_layer.weight.shape[0] + print(f"Tokens for new languages: {len(tokenizer) - original_vocab_size}") model.resize_token_embeddings(len(tokenizer)) model.tie_weights() @@ -605,15 +610,9 @@ def zero_grad(grad): embedding_layer.weight.register_hook(lambda grad: zero_grad(grad)) - #if model_args.embedding_strategies == "overlap-replace": - # if not tokenizer.name_or_path == model_args.model_name_or_path: - # orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - # model.add_embeddings('lng_emb', tokenizer, reference_embedding='default', reference_tokenizer=orig_tokenizer ) - # model._active_embedding = "lng_emb" - # model.delete_embeddings('default') - # model.tie_weights() - #elif model_args.embedding_strategies == "replace": - # model.resize_token_embeddings(len(tokenizer)) + if model_args.reinit_weights: + print(f"❗️ Reinitialize model's weights") + model.init_weights() trainable_params = 0 frozen_params = 0 @@ -629,6 +628,10 @@ def zero_grad(grad): elif model_args.lang_adapt_strategies == "bitfit": if 'bias' not in name: param.requires_grad = False + else: + param.requires_grad = True + elif model_args.lang_adapt_strategies == "continual-pretrain": + param.requires_grad = True if not param.requires_grad: print(f"🥶 Frozen layer '{name}'") @@ -659,7 +662,7 @@ def main(): training_args.data_dir = f'{training_args.output_dir}' - assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt', 'lora', 'bitfit') + assert model_args.lang_adapt_strategies in ('continual-pretrain', 'emb', 'madx', 'emb-then-adpt', 'lora', 'bitfit') assert model_args.embedding_strategies in ('replace', 'extend', 'overlap-replace') # Setup logging From b0a23c5d991a0d213f224b1d1c872eaa630a96c0 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 7 Jul 2022 10:09:10 -0400 Subject: [PATCH 141/142] update xlsum --- scripts/eval/eval.py | 153 +++++++++++++++++++------------------------ 1 file changed, 67 insertions(+), 86 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 3dcd67e..5d431a9 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -2,6 +2,7 @@ import argparse import os import json +from tqdm import tqdm from datasets import load_dataset from datasets import load_metric @@ -79,10 +80,16 @@ trainer_no_task_adpt_class_mapping = {XNLI: Trainer, XLSUM: Seq2SeqTrainer, WIKIANN: Trainer} trainer_class_mapping = {XNLI: AdapterTrainer, XLSUM: Seq2SeqAdapterTrainer, WIKIANN: AdapterTrainer} trainer_args_mapping = {XNLI: TrainingArguments, XLSUM: Seq2SeqTrainingArguments, WIKIANN: TrainingArguments} -task_eval_metric_best_model = {XNLI: 'eval_accuracy', WIKIANN: 'eval_overall_f1'} +task_eval_metric_best_model = {XNLI: 'eval_accuracy', XLSUM: 'eval_loss', WIKIANN: 'eval_overall_f1'} args = parser.parse_args() +# XLSUM +XLSUM_INPUT_LEN = 512 +XLSUM_OUTPUT_LEN = 64 +XLSUM_NUM_BEAMS = 1 +XLSUM_LEN_PENALTY = 0.6 + #### Process args if not 
args.cross_lingual and not args.train_lang: args.train_lang = args.lang @@ -102,11 +109,11 @@ elif args.dataset == WIKIANN: optional_model_kwargs = {"num_labels": 7} elif args.dataset == XLSUM: - optional_trainer_args = {"generation_max_length": 512 + 64, + optional_trainer_args = {"generation_max_length": XLSUM_INPUT_LEN + XLSUM_OUTPUT_LEN, "predict_with_generate":True, "optim": "adafactor", "lr_scheduler_type": "linear", - "warmup_steps": 0} + "warmup_ratio": 0.1} if args.local_rank: torch.cuda.set_device(args.local_rank) @@ -161,11 +168,16 @@ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, cache_dir=args.cache_dir, revision=args.revision, add_prefix_space=args.dataset in [WIKIANN]) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token +if tokenizer.sep_token is None: + tokenizer.sep_token = tokenizer.bos_token # TODO: we probably need better code for this than multiple if-else statements en_tokenizer = AutoTokenizer.from_pretrained(args.original_model, cache_dir=args.cache_dir, revision=args.revision, add_prefix_space=args.dataset in [WIKIANN]) if en_tokenizer.pad_token is None: en_tokenizer.pad_token = en_tokenizer.eos_token +if en_tokenizer.sep_token is None: + en_tokenizer.sep_token = en_tokenizer.bos_token + # en_tokenizer.add_special_tokens({'sep_token':'<|sep|>'}) if args.dataset == XNLI: if tokenizer.eos_token is None: @@ -184,30 +196,28 @@ def en_tokenize_function(examples): # also, unlike enc-dec model, we cannot feed the model some text and expect the model to generate only summary # we need to have input = [text] + [padding] and the output be [text] + [summary]. def tokenize_function(example): - text = tokenizer(f'{example["text"]}', max_length=511, truncation=True) - # TODO: sep_token instead of bos_token - input_text = tokenizer.decode(text['input_ids'], skip_special_tokens=True) + tokenizer.bos_token + text = tokenizer(f'{example["text"]}', max_length=XLSUM_INPUT_LEN - 1, padding="max_length", truncation=True) + input_text = tokenizer.decode(text['input_ids'], skip_special_tokens=False) + tokenizer.sep_token with tokenizer.as_target_tokenizer(): - summaries = tokenizer(f'{example["summary"]}', max_length=64, padding="max_length", truncation=True) - summaries_text = tokenizer.decode(summaries['input_ids'], skip_special_tokens=True) - - inputs = tokenizer(f'{input_text + summaries_text}', max_length=512 + 64, padding="max_length", truncation=True) + summaries = tokenizer(f'{example["summary"]}', max_length=XLSUM_OUTPUT_LEN, padding="max_length", truncation=True) + summaries_text = tokenizer.decode(summaries['input_ids'], skip_special_tokens=False) + inputs = tokenizer(f'{input_text + summaries_text}') inputs["labels"] = inputs["input_ids"] return inputs - def en_tokenize_function(example): - inputs = en_tokenizer(f'{example["text"]}', max_length=512, padding="max_length", truncation=True) + ... 
+ # inputs = en_tokenizer(f'{example["text"]}', max_length=512, padding="max_length", truncation=True) - with en_tokenizer.as_target_tokenizer(): - summaries = en_tokenizer(f'{example["summary"]}', max_length=512, padding="max_length", truncation=True) + # with en_tokenizer.as_target_tokenizer(): + # summaries = en_tokenizer(f'{example["summary"]}', max_length=512, padding="max_length", truncation=True) - inputs["labels"] = summaries["input_ids"] + # inputs["labels"] = summaries["input_ids"] - return inputs + # return inputs elif args.dataset == WIKIANN: def tokenize_function(examples): @@ -285,61 +295,34 @@ def compute_metrics(eval_pred): metric = load_metric('rouge') def compute_metrics(eval_preds): - # TODO: note that this function calls trainer.model - preds, labels = eval_preds - - preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - - preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in preds] - labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels] - - result = metric.compute(predictions=preds, references=labels) - # TODO: need to confirm these are the right rouge values to report. Can report more ROUGE metrics if needed. - result = {key: value.mid.fmeasure * 100 for key, value in result.items()} - - return {k: round(v, 4) for k, v in result.items()} + return {} - def compute_beam_search_metrics(model, dataset): - input_ids = torch.Tensor(dataset['input_ids']).type(torch.IntTensor) - model.cuda() - print(input_ids.shape) - print(model.device) + def compute_xlsum_beam_search_metrics(model, dataset): + # get input sentences + input_ids = torch.Tensor(dataset['input_ids']).type(torch.IntTensor)[:, :XLSUM_INPUT_LEN] + bsz = args.per_device_eval_batch_size - beam_scorer = BeamSearchScorer( - batch_size=2, - num_beams=4, - device=model.device, - ) - - # instantiate logits processors - logits_processor = LogitsProcessorList( - [ - ForcedEOSTokenLogitsProcessor(512+64, eos_token_id=model.config.eos_token_id), - ] - ) + # get generated summaries + preds = list() + for i in tqdm(range(0, input_ids.shape[0], bsz), desc="Summarization task: generation"): + outputs = model.generate(input_ids[i:i+bsz], max_length=XLSUM_INPUT_LEN+XLSUM_OUTPUT_LEN, length_penalty=XLSUM_LEN_PENALTY, num_beams=XLSUM_NUM_BEAMS) + preds += tokenizer.batch_decode(outputs[:, XLSUM_INPUT_LEN:], skip_special_tokens=True) - preds = model.beam_search(input_ids[:2, :512].repeat_interleave(4, dim=0).cuda(), beam_scorer, logits_processor=logits_processor) - preds = tokenizer.batch_decode(preds) - print(preds) - assert False - labels = np.array(dataset['input_ids'])[:2, 512:] + # get gold summaries + labels = np.array(dataset['input_ids'])[:, XLSUM_INPUT_LEN:] labels = np.where(labels != -100, labels, tokenizer.pad_token_id) labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + print(preds) + print(labels) + + # compute ROUGE metrics preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels] result = metric.compute(predictions=preds, references=labels) - print(result) - # print(preds) - # labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - # labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + result = {key: value.mid.fmeasure * 100 for key, value in result.items()} - # preds = 
["\n".join(nltk.sent_tokenize(pred.strip())) for pred in preds] - # labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels] - - # result = metric.compute(predictions=preds, references=labels) + return {k: round(v, 4) for k, v in result.items()} else: raise ValueError("Unknown dataset provided") @@ -414,10 +397,12 @@ def load_task_specific_adapters(args, model, inference=False): cache_dir=args.cache_dir, revision=args.revision, **optional_model_kwargs) + if not inference: model.add_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") model.train_adapter(f"{args.dataset.split('/')[-1]}-task-adapter") return model + else: print(f"[Evaluation] Load task adapters from {args.pretrained_adapters_dir}/{args.dataset.split('/')[-1]}-task-adapter") adapter_name = model.load_adapter(f"{args.pretrained_adapters_dir}/{args.dataset.split('/')[-1]}-task-adapter") @@ -460,13 +445,14 @@ def load_language_adapters(args, model): model = make_base_model_trainable(args, model, inference) return model + # load unadapted model model = model_class_mapping[args.dataset].from_pretrained(args.original_model, pad_token_id=pad_token_id, cache_dir=args.cache_dir, revision=args.revision, **optional_model_kwargs) - # adapted models + # load adapted model if not args.cross_lingual or inference: model = load_embedding_layers(args, tokenizer, model) if args.madx_lang_adapter: @@ -534,32 +520,27 @@ def load_language_adapters(args, model): if args.dataset == XLSUM: # use beam search to get the results following the XLSUM paper - compute_beam_search_metrics(model, test_dataset) - assert False - - data_collator = DataCollatorForSeq2Seq( - tokenizer, - model=model, - label_pad_token_id=-100, - pad_to_multiple_of=8 if training_args.fp16 else None, - ) - - if model.active_adapters is None: - logger.info("No active adapters") - trainer_class = trainer_no_task_adpt_class_mapping[args.dataset] + print(f"Evaluating on test set ({XLSUM})...") + result = compute_xlsum_beam_search_metrics(model, test_dataset) + print(result) + else: - trainer_class = trainer_class_mapping[args.dataset] + if model.active_adapters is None: + logger.info("No active adapters") + trainer_class = trainer_no_task_adpt_class_mapping[args.dataset] + else: + trainer_class = trainer_class_mapping[args.dataset] - eval_trainer = trainer_class( - model=model, - args=training_args, - eval_dataset=test_dataset, - compute_metrics=compute_metrics, - # args for xlsum only - **{"data_collator": data_collator} if args.dataset == XLSUM else {} + eval_trainer = trainer_class( + model=model, + args=training_args, + eval_dataset=test_dataset, + compute_metrics=compute_metrics, + # args for xlsum only + **{"data_collator": data_collator} if args.dataset == XLSUM else {} - ) + ) - print("Evaluating on test set...") - print(eval_trainer.evaluate()) + print("Evaluating on test set...") + print(eval_trainer.evaluate()) From c7e1e6f2c0f572427a32e3ebee7d260edb365704 Mon Sep 17 00:00:00 2001 From: yongzx Date: Thu, 7 Jul 2022 10:39:48 -0400 Subject: [PATCH 142/142] fix tokenization --- scripts/eval/eval.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 5d431a9..f08ea1a 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -194,17 +194,17 @@ def en_tokenize_function(examples): elif args.dataset == XLSUM: # for decoder only structure, input and target needs to have the same length # also, unlike enc-dec model, we cannot feed the model some text and expect the model to 
generate only summary - # we need to have input = [text] + [padding] and the output be [text] + [summary]. + # we need to train the model with [text] + [sep] + [summary]. def tokenize_function(example): - text = tokenizer(f'{example["text"]}', max_length=XLSUM_INPUT_LEN - 1, padding="max_length", truncation=True) - input_text = tokenizer.decode(text['input_ids'], skip_special_tokens=False) + tokenizer.sep_token - + inputs = tokenizer(f'{example["text"]}', max_length=XLSUM_INPUT_LEN, padding="max_length", truncation=True) + inputs['input_ids'][-1] = tokenizer.sep_token_id + with tokenizer.as_target_tokenizer(): summaries = tokenizer(f'{example["summary"]}', max_length=XLSUM_OUTPUT_LEN, padding="max_length", truncation=True) - summaries_text = tokenizer.decode(summaries['input_ids'], skip_special_tokens=False) - inputs = tokenizer(f'{input_text + summaries_text}') - inputs["labels"] = inputs["input_ids"] + inputs['input_ids'] += summaries['input_ids'] + inputs['attention_mask'] += summaries['attention_mask'] + inputs['labels'] = inputs['input_ids'] return inputs @@ -299,6 +299,7 @@ def compute_metrics(eval_preds): def compute_xlsum_beam_search_metrics(model, dataset): # get input sentences + # print(torch.Tensor(dataset['input_ids']).type(torch.IntTensor)) input_ids = torch.Tensor(dataset['input_ids']).type(torch.IntTensor)[:, :XLSUM_INPUT_LEN] bsz = args.per_device_eval_batch_size
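For reference, the `[text] + [sep] + [summary]` packing that the last two commits converge on can be sketched outside the diff as follows. This is a minimal illustration rather than the repository's code: `gpt2` stands in for the adapted BLOOM tokenizer, `pack_example` is a hypothetical helper, and 512/64 mirror `XLSUM_INPUT_LEN`/`XLSUM_OUTPUT_LEN` above.

```
from transformers import AutoTokenizer

INPUT_LEN, OUTPUT_LEN = 512, 64  # mirrors XLSUM_INPUT_LEN / XLSUM_OUTPUT_LEN

tok = AutoTokenizer.from_pretrained("gpt2")  # stand-in checkpoint for illustration
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
if tok.sep_token is None:
    tok.sep_token = tok.bos_token

def pack_example(text, summary):
    # [text] truncated/padded to INPUT_LEN, with the last slot overwritten by the separator
    enc = tok(text, max_length=INPUT_LEN, padding="max_length", truncation=True)
    enc["input_ids"][-1] = tok.sep_token_id
    # [summary] truncated/padded to OUTPUT_LEN and appended
    out = tok(summary, max_length=OUTPUT_LEN, padding="max_length", truncation=True)
    enc["input_ids"] = enc["input_ids"] + out["input_ids"]
    enc["attention_mask"] = enc["attention_mask"] + out["attention_mask"]
    # causal-LM training: the labels are the full packed sequence
    enc["labels"] = enc["input_ids"]
    return enc

ex = pack_example("A long article ...", "A short summary.")
assert len(ex["input_ids"]) == INPUT_LEN + OUTPUT_LEN
# At evaluation time only the first INPUT_LEN ids are fed to model.generate(...);
# the ids after position INPUT_LEN are decoded as the predicted summary.
```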