From a2000c34cfc7bcc53493c90146f4b649b78df4e1 Mon Sep 17 00:00:00 2001 From: Omobayode Fagbohungbe Date: Wed, 3 Sep 2025 11:55:00 -0400 Subject: [PATCH 1/3] fix: optimize the data_handling for DQ Signed-off-by: Omobayode Fagbohungbe --- fms_mo/dq.py | 2 +- fms_mo/utils/calib_data.py | 112 ++++++++++++++++++++++++------------- fms_mo/utils/eval_utils.py | 6 +- 3 files changed, 77 insertions(+), 43 deletions(-) diff --git a/fms_mo/dq.py b/fms_mo/dq.py index eb49bc30..44caf2e6 100644 --- a/fms_mo/dq.py +++ b/fms_mo/dq.py @@ -287,7 +287,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args): eval_llm_1GPU(qcfg, model, test_dataset) else: model.to(torch.device("cuda:0")) - n_samples = int(test_dataset.input_ids.shape[1] / block_size) + n_samples = int(test_dataset["input_ids"].shape[1] / block_size) evaluator = Evaluator(test_dataset, "cuda", n_samples=n_samples) with patch_torch_bmm(qcfg): ppl = evaluator.evaluate(model, block_size=block_size) diff --git a/fms_mo/utils/calib_data.py b/fms_mo/utils/calib_data.py index a61d3a76..43d829c9 100755 --- a/fms_mo/utils/calib_data.py +++ b/fms_mo/utils/calib_data.py @@ -31,23 +31,26 @@ import torch -def return_tokenized_samples(nsamples, trainenc, seqlen, sequential=False): +def return_tokenized_samples(nsamples, trainenc, seqlen, sequential=False) -> dict[str, torch.int]: """Randomly crop nsamples sequence from trainenc, each with the length of seqlen. see below functions, e.g. get_wikitext2() for more details. """ - traindataset = [] + traindataset = { + "input_ids": torch.zeros(size = (nsamples, seqlen), dtype = torch.int), + "attention_mask": torch.zeros(size = (nsamples, seqlen), dtype = torch.int) + } i = 0 - for _ in range(nsamples): + for k in range(nsamples): if not sequential: i = random.randint(0, len(trainenc.input_ids) - seqlen - 1) j = i + seqlen inp = trainenc.input_ids[i:j] mask = trainenc.attention_mask[i:j] - traindataset.append( - {"input_ids": torch.tensor(inp), "attention_mask": torch.tensor(mask)} - ) + traindataset["input_ids"][k] = torch.tensor(inp) + traindataset["attention_mask"][k] = torch.tensor(mask) + i = j return traindataset @@ -83,11 +86,15 @@ def get_wikitext2( traindataset = return_tokenized_samples( nsamples, trainenc, seqlen, sequential=sequential ) + testenc = { + "input_ids": testenc["input_ids"], + "attention_mask": testenc["attention_mask"] + } return traindataset, testenc -def get_ptb(nsamples, seed, seqlen, model, sequential=False, gptq_style=False): +def get_ptb(nsamples, seed, seqlen, tokenizer, sequential=False, gptq_style=False): """Prepare data for GPTQ using PTB dataset. 
Args: @@ -102,8 +109,6 @@ def get_ptb(nsamples, seed, seqlen, model, sequential=False, gptq_style=False): """ random.seed(seed) - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True) - traindata = load_dataset("ptb_text_only", "penn_treebank", split="train") valdata = load_dataset("ptb_text_only", "penn_treebank", split="validation") if gptq_style: @@ -112,9 +117,13 @@ def get_ptb(nsamples, seed, seqlen, model, sequential=False, gptq_style=False): traindata = "\n\n".join(traindata["sentence"]) trainenc = tokenizer(traindata) - testenc = tokenizer("\n\n".join(valdata["sentence"])) + testenc = tokenizer("\n\n".join(valdata["sentence"]),return_tensors="pt") traindataset = return_tokenized_samples(nsamples, trainenc, seqlen, sequential) + testenc = { + "input_ids": testenc["input_ids"], + "attention_mask": testenc["attention_mask"] + } return traindataset, testenc @@ -144,8 +153,13 @@ def get_c4_train(nsamples, seed, seqlen, tokenizer, sequential=False): split="validation", ) - trainloader = [] - for _ in range(nsamples): + testenc = tokenizer("\n\n".join(valdata["text"]),return_tensors="pt") + + trainloader ={ + "input_ids": torch.zeros(size = (nsamples, seqlen), dtype = torch.int), + "attention_mask": torch.zeros(size = (nsamples, seqlen), dtype = torch.int) + } + for k in range(nsamples): while True: i = random.randint(0, len(traindata) - 1) trainenc = tokenizer(traindata[i]["text"]) @@ -156,14 +170,14 @@ def get_c4_train(nsamples, seed, seqlen, tokenizer, sequential=False): j = i + seqlen inp = trainenc.input_ids[i:j] mask = trainenc.attention_mask[i:j] - trainloader.append({"input_ids": inp, "attention_mask": mask}) + trainloader["input_ids"][k] = torch.tensor(inp) + trainloader["attention_mask"][k] = torch.tensor(mask) j = i - testdataset = [ - { - "input_ids": torch.tensor(valdata.input_ids), - "attention_mask": torch.tensor(valdata.attention_mask), - } - ] + + testdataset = { + "input_ids": testenc["input_ids"], + "attention_mask": testenc["attention_mask"], + } return trainloader, testdataset @@ -229,22 +243,34 @@ def get_self_instruct_starcoder( cr_dataset = load_dataset("codeparrot/self-instruct-starcoder", split=split_name) eval_dataset = tokenizer(" ".join(cr_dataset[:]["output"]), return_tensors="pt") + eval_dataset = { + "input_ids": eval_dataset["input_ids"], + "attention_mask": eval_dataset["attention_mask"] + } + cr_dataset.shuffle(seed) nsamples = min(nsamples, len(cr_dataset)) - trainloader = [] - for i in range(nsamples): - tokenized = tokenizer(cr_dataset[i]["output"], return_tensors="pt") - trainloader.append( - { - "input_ids": tokenized.input_ids.squeeze(0), - "attention_mask": tokenized.attention_mask.squeeze(0), - } + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + trainloader = { + "input_ids": torch.zeros(size = (nsamples,seqlen), dtype=torch.int), + "attention_mask": torch.zeros(size = (nsamples,seqlen), dtype=torch.int) + } + for k in range(nsamples): + tokenized = tokenizer( + cr_dataset[k]["output"], return_tensors="pt", + padding="max_length", max_length = seqlen ) + trainloader["input_ids"][k] = tokenized.input_ids.squeeze(0) + trainloader["attention_mask"][k] = tokenized.attention_mask.squeeze(0) + return trainloader, eval_dataset def get_cobol_java_supervised( - nsamples, seed, model, seqlen=8192, split_name="both", file_path=None + nsamples, seed, seqlen=8192, tokenizer = "", split_name="both", file_path=None ): """Prepare data for GPTQ using cobol/java dataset. 
@@ -265,13 +291,21 @@ def get_cobol_java_supervised( raw_data = f.readlines() data_dict_array = [json.loads(line) for line in raw_data] - random.shuffle(data_dict_array) - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True) + eval_dataset = tokenizer(data_dict_array["content"], return_tensors="pt") + eval_dataset = { + "input_ids": eval_dataset["input_ids"], + "attention_mask": eval_dataset["attention_mask"] + } + + random.shuffle(data_dict_array) nsamples = min(nsamples, len(data_dict_array)) - trainloader = [] + trainloader = { + "input_ids": torch.zeros(size = (nsamples,seqlen), dtype=torch.int), + "attention_mask": torch.zeros(size = (nsamples,seqlen), dtype=torch.int) + } added_ex = 0 while added_ex < nsamples: @@ -300,16 +334,12 @@ def get_cobol_java_supervised( inputs = inputs[i:j] tokenized = tokenizer(inputs, return_tensors="pt") - trainloader.append( - { - "input_ids": tokenized.input_ids, - "attention_mask": tokenized.attention_mask, - } - ) + trainloader["input_ids"][added_ex] = tokenized.input_ids.squeeze(0) + trainloader["attention_mask"][added_ex] = tokenized.attention_mask.squeeze(0) added_ex += 1 - return trainloader, None + return trainloader, eval_dataset def get_tokenized_data( @@ -390,6 +420,10 @@ def get_tokenized_data( traindataset, testdataset = get_self_instruct_starcoder( nsamples, seed, seqlen, tokenizer, split_name="curated" ) + elif "java" in name: + traindataset, testdataset = get_cobol_java_supervised( + nsamples, seed, seqlen, tokenizer, + ) else: raise NotImplementedError( f"Dataset {name} is not implemented yet. Please refer to get_wikitext2() and implement" @@ -397,7 +431,7 @@ def get_tokenized_data( ) if path_to_save: - datasets.Dataset.from_list(traindataset).save_to_disk(path_to_save + "_train") + datasets.Dataset.from_dict(traindataset).save_to_disk(path_to_save + "_train") if isinstance(testdataset, BatchEncoding): if not os.path.exists(path_to_save + "_test"): os.mkdir(path_to_save + "_test") diff --git a/fms_mo/utils/eval_utils.py b/fms_mo/utils/eval_utils.py index 774d74e0..55d0e464 100644 --- a/fms_mo/utils/eval_utils.py +++ b/fms_mo/utils/eval_utils.py @@ -45,7 +45,7 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): # qcfg["batch_size"] = 1 # for dataloading, always use batch_size of 1 qcfg["dtype"] = next(iter(model.parameters())).dtype seq_len = qcfg["seq_len"] - qcfg["n_samples"] = int(test_dataset.input_ids.shape[1] / seq_len) + qcfg["n_samples"] = int(test_dataset["input_ids"].shape[1] / seq_len) # --- Phase 0 cache the inputs of the block0--- use_cache = model.config.use_cache model.config.use_cache = False @@ -116,7 +116,7 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): # # Shift so that tokens < n predict n shift_logits = lm_logits[:, :-1, :].contiguous().float() - shift_labels = test_dataset.input_ids[:, (i * seq_len) : ((i + 1) * seq_len)][ + shift_labels = test_dataset["input_ids"][:, (i * seq_len) : ((i + 1) * seq_len)][ :, 1: ].to(dev) loss_fct = nn.CrossEntropyLoss() @@ -144,7 +144,7 @@ def __init__(self, dataset, device, n_samples=160): self.dataset = dataset self.device = device # loading tokenized dataset. 
- self.dataset = dataset.input_ids.to(device) + self.dataset = dataset['input_ids'].to(device) self.n_samples = n_samples @torch.no_grad() From cfb9ea9b31f59725a4e68b14d3e828ace18ad48e Mon Sep 17 00:00:00 2001 From: Omobayode Fagbohungbe Date: Wed, 3 Sep 2025 13:26:19 -0400 Subject: [PATCH 2/3] fix: corrected the trailing spaces Signed-off-by: Omobayode Fagbohungbe --- fms_mo/utils/calib_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fms_mo/utils/calib_data.py b/fms_mo/utils/calib_data.py index 43d829c9..0301f1a6 100755 --- a/fms_mo/utils/calib_data.py +++ b/fms_mo/utils/calib_data.py @@ -26,7 +26,7 @@ # Third Party from datasets import load_dataset, load_from_disk -from transformers import AutoTokenizer, BatchEncoding +from transformers import BatchEncoding import datasets import torch @@ -260,12 +260,12 @@ def get_self_instruct_starcoder( } for k in range(nsamples): tokenized = tokenizer( - cr_dataset[k]["output"], return_tensors="pt", + cr_dataset[k]["output"], return_tensors="pt", padding="max_length", max_length = seqlen ) trainloader["input_ids"][k] = tokenized.input_ids.squeeze(0) trainloader["attention_mask"][k] = tokenized.attention_mask.squeeze(0) - + return trainloader, eval_dataset From a0f74da33449f3f13dcb9c91353a053fa3938dcf Mon Sep 17 00:00:00 2001 From: Omobayode Fagbohungbe Date: Mon, 8 Sep 2025 13:47:26 -0400 Subject: [PATCH 3/3] fix: adding hints to the arguments and returns Signed-off-by: Omobayode Fagbohungbe --- fms_mo/utils/calib_data.py | 108 +++++++++++++++++++++++-------------- fms_mo/utils/eval_utils.py | 8 +-- 2 files changed, 71 insertions(+), 45 deletions(-) diff --git a/fms_mo/utils/calib_data.py b/fms_mo/utils/calib_data.py index 0301f1a6..1df6698d 100755 --- a/fms_mo/utils/calib_data.py +++ b/fms_mo/utils/calib_data.py @@ -31,13 +31,15 @@ import torch -def return_tokenized_samples(nsamples, trainenc, seqlen, sequential=False) -> dict[str, torch.int]: +def return_tokenized_samples( + nsamples: int, trainenc: list, seqlen: int, sequential: bool = False +) -> dict: """Randomly crop nsamples sequence from trainenc, each with the length of seqlen. see below functions, e.g. get_wikitext2() for more details. """ traindataset = { - "input_ids": torch.zeros(size = (nsamples, seqlen), dtype = torch.int), - "attention_mask": torch.zeros(size = (nsamples, seqlen), dtype = torch.int) + "input_ids": torch.zeros(size=(nsamples, seqlen), dtype=torch.int), + "attention_mask": torch.zeros(size=(nsamples, seqlen), dtype=torch.int), } i = 0 @@ -57,8 +59,13 @@ def return_tokenized_samples(nsamples, trainenc, seqlen, sequential=False) -> di def get_wikitext2( - nsamples, seed, seqlen, tokenizer, sequential=False, gptq_style=False -): + nsamples: int, + seed: int, + seqlen: int, + tokenizer: str, + sequential: bool = False, + gptq_style: bool = False, +) -> tuple[dict, dict]: """Prepare data for GPTQ using wikitext2 dataset. Args: @@ -87,14 +94,21 @@ def get_wikitext2( nsamples, trainenc, seqlen, sequential=sequential ) testenc = { - "input_ids": testenc["input_ids"], - "attention_mask": testenc["attention_mask"] + "input_ids": testenc["input_ids"], + "attention_mask": testenc["attention_mask"], } return traindataset, testenc -def get_ptb(nsamples, seed, seqlen, tokenizer, sequential=False, gptq_style=False): +def get_ptb( + nsamples: int, + seed: int, + seqlen: int, + tokenizer: str, + sequential: bool = False, + gptq_style: bool = False, +) -> tuple[dict, dict]: """Prepare data for GPTQ using PTB dataset. 
Args: @@ -117,18 +131,20 @@ def get_ptb(nsamples, seed, seqlen, tokenizer, sequential=False, gptq_style=Fals traindata = "\n\n".join(traindata["sentence"]) trainenc = tokenizer(traindata) - testenc = tokenizer("\n\n".join(valdata["sentence"]),return_tensors="pt") + testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt") traindataset = return_tokenized_samples(nsamples, trainenc, seqlen, sequential) testenc = { - "input_ids": testenc["input_ids"], - "attention_mask": testenc["attention_mask"] + "input_ids": testenc["input_ids"], + "attention_mask": testenc["attention_mask"], } return traindataset, testenc -def get_c4_train(nsamples, seed, seqlen, tokenizer, sequential=False): +def get_c4_train( + nsamples: int, seed: int, seqlen: int, tokenizer: str, sequential: bool = False +) -> tuple[dict, dict]: """Prepare data for GPTQ using C4 dataset. Args: @@ -153,11 +169,11 @@ def get_c4_train(nsamples, seed, seqlen, tokenizer, sequential=False): split="validation", ) - testenc = tokenizer("\n\n".join(valdata["text"]),return_tensors="pt") + testenc = tokenizer("\n\n".join(valdata["text"]), return_tensors="pt") - trainloader ={ - "input_ids": torch.zeros(size = (nsamples, seqlen), dtype = torch.int), - "attention_mask": torch.zeros(size = (nsamples, seqlen), dtype = torch.int) + trainloader = { + "input_ids": torch.zeros(size=(nsamples, seqlen), dtype=torch.int), + "attention_mask": torch.zeros(size=(nsamples, seqlen), dtype=torch.int), } for k in range(nsamples): while True: @@ -182,7 +198,7 @@ def get_c4_train(nsamples, seed, seqlen, tokenizer, sequential=False): return trainloader, testdataset -def get_c4_new(nsamples, seed, seqlen, tokenizer): +def get_c4_new(nsamples: int, seed: int, seqlen: int, tokenizer: str): """Prepare data for GPTQ using C4 dataset. Args: @@ -227,8 +243,8 @@ def get_c4_new(nsamples, seed, seqlen, tokenizer): def get_self_instruct_starcoder( - nsamples, seed, seqlen, tokenizer, split_name="curated" -): # pylint: disable=unused-argument + nsamples: int, seed: int, seqlen: int, tokenizer: str, split_name: str = "curated" +) -> tuple[dict, dict]: # pylint: disable=unused-argument """Prepare data for GPTQ using starcoder dataset. 
Args: @@ -244,8 +260,8 @@ def get_self_instruct_starcoder( eval_dataset = tokenizer(" ".join(cr_dataset[:]["output"]), return_tensors="pt") eval_dataset = { - "input_ids": eval_dataset["input_ids"], - "attention_mask": eval_dataset["attention_mask"] + "input_ids": eval_dataset["input_ids"], + "attention_mask": eval_dataset["attention_mask"], } cr_dataset.shuffle(seed) @@ -255,13 +271,15 @@ def get_self_instruct_starcoder( tokenizer.pad_token = tokenizer.eos_token trainloader = { - "input_ids": torch.zeros(size = (nsamples,seqlen), dtype=torch.int), - "attention_mask": torch.zeros(size = (nsamples,seqlen), dtype=torch.int) + "input_ids": torch.zeros(size=(nsamples, seqlen), dtype=torch.int), + "attention_mask": torch.zeros(size=(nsamples, seqlen), dtype=torch.int), } for k in range(nsamples): tokenized = tokenizer( - cr_dataset[k]["output"], return_tensors="pt", - padding="max_length", max_length = seqlen + cr_dataset[k]["output"], + return_tensors="pt", + padding="max_length", + max_length=seqlen, ) trainloader["input_ids"][k] = tokenized.input_ids.squeeze(0) trainloader["attention_mask"][k] = tokenized.attention_mask.squeeze(0) @@ -270,8 +288,13 @@ def get_self_instruct_starcoder( def get_cobol_java_supervised( - nsamples, seed, seqlen=8192, tokenizer = "", split_name="both", file_path=None -): + nsamples: int, + seed: int, + seqlen: int = 8192, + tokenizer: str = "", + split_name: str = "both", + file_path: str = None, +) -> tuple[dict, dict]: """Prepare data for GPTQ using cobol/java dataset. Args: @@ -294,8 +317,8 @@ def get_cobol_java_supervised( eval_dataset = tokenizer(data_dict_array["content"], return_tensors="pt") eval_dataset = { - "input_ids": eval_dataset["input_ids"], - "attention_mask": eval_dataset["attention_mask"] + "input_ids": eval_dataset["input_ids"], + "attention_mask": eval_dataset["attention_mask"], } random.shuffle(data_dict_array) @@ -303,8 +326,8 @@ def get_cobol_java_supervised( nsamples = min(nsamples, len(data_dict_array)) trainloader = { - "input_ids": torch.zeros(size = (nsamples,seqlen), dtype=torch.int), - "attention_mask": torch.zeros(size = (nsamples,seqlen), dtype=torch.int) + "input_ids": torch.zeros(size=(nsamples, seqlen), dtype=torch.int), + "attention_mask": torch.zeros(size=(nsamples, seqlen), dtype=torch.int), } added_ex = 0 @@ -343,15 +366,15 @@ def get_cobol_java_supervised( def get_tokenized_data( - name, - nsamples=128, - seqlen=2048, - tokenizer="", - seed=0, - gptq_style=False, - path_to_save=None, - field_name=None, -): + name: str, + nsamples: int = 128, + seqlen: int = 2048, + tokenizer: str = "", + seed: int = 0, + gptq_style: bool = False, + path_to_save: str = None, + field_name: str = None, +) -> tuple[dict, dict]: """Convenient function to get data. Default to get_wikitext2.""" # Option 1: User provide a dataset from disk, only need to tokenize and format it. 
@@ -422,7 +445,10 @@ def get_tokenized_data( ) elif "java" in name: traindataset, testdataset = get_cobol_java_supervised( - nsamples, seed, seqlen, tokenizer, + nsamples, + seed, + seqlen, + tokenizer, ) else: raise NotImplementedError( diff --git a/fms_mo/utils/eval_utils.py b/fms_mo/utils/eval_utils.py index 55d0e464..f2ea858f 100644 --- a/fms_mo/utils/eval_utils.py +++ b/fms_mo/utils/eval_utils.py @@ -116,9 +116,9 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): # # Shift so that tokens < n predict n shift_logits = lm_logits[:, :-1, :].contiguous().float() - shift_labels = test_dataset["input_ids"][:, (i * seq_len) : ((i + 1) * seq_len)][ - :, 1: - ].to(dev) + shift_labels = test_dataset["input_ids"][ + :, (i * seq_len) : ((i + 1) * seq_len) + ][:, 1:].to(dev) loss_fct = nn.CrossEntropyLoss() loss = loss_fct( shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) @@ -144,7 +144,7 @@ def __init__(self, dataset, device, n_samples=160): self.dataset = dataset self.device = device # loading tokenized dataset. - self.dataset = dataset['input_ids'].to(device) + self.dataset = dataset["input_ids"].to(device) self.n_samples = n_samples @torch.no_grad()
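
Net effect of this series: the calibration loaders in fms_mo/utils/calib_data.py now return a single dict of pre-allocated (nsamples, seqlen) integer tensors instead of a list of per-sample dicts, and fms_mo/dq.py / fms_mo/utils/eval_utils.py switch from attribute access (test_dataset.input_ids) to subscript access (test_dataset["input_ids"]) to match. The snippet below is not part of the patch; it is a minimal sketch of how downstream code could consume the new layout, using dummy tensors shaped like the loaders' output. The batch_iter helper is hypothetical and only included for illustration.

    import torch


    def batch_iter(calib_data: dict, batch_size: int = 4):
        """Hypothetical helper: slice the dict-of-tensors layout into batches.

        calib_data mirrors what the patched loaders return, e.g.
        {"input_ids": IntTensor[nsamples, seqlen],
         "attention_mask": IntTensor[nsamples, seqlen]}.
        """
        nsamples = calib_data["input_ids"].shape[0]
        for start in range(0, nsamples, batch_size):
            end = min(start + batch_size, nsamples)
            yield {
                "input_ids": calib_data["input_ids"][start:end],
                "attention_mask": calib_data["attention_mask"][start:end],
            }


    # Dummy data shaped like the get_tokenized_data() defaults
    # (128 samples of length 2048), standing in for a real loader's output.
    calib = {
        "input_ids": torch.zeros((128, 2048), dtype=torch.int),
        "attention_mask": torch.ones((128, 2048), dtype=torch.int),
    }
    for batch in batch_iter(calib, batch_size=8):
        assert batch["input_ids"].shape == (8, 2048)

Because the whole calibration set now lives in two contiguous tensors, datasets.Dataset.from_dict(traindataset) in get_tokenized_data() can serialize it directly, which is why the from_list call was replaced.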