diff --git a/experiments/Accuracy_checking/Pixtral_12B.py b/experiments/Accuracy_checking/Pixtral_12B.py
new file mode 100644
index 0000000..3d4e233
--- /dev/null
+++ b/experiments/Accuracy_checking/Pixtral_12B.py
@@ -0,0 +1,173 @@
+from unsloth import FastVisionModel
+import torch
+from unsloth import is_bf16_supported
+from unsloth.trainer import UnslothVisionDataCollator
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+import numpy as np
+from tqdm import tqdm
+from bert_score import score
+from transformers import TextStreamer
+
+def load_model():
+    model, tokenizer = FastVisionModel.from_pretrained(
+        "unsloth/Pixtral-12B-2409",
+        load_in_4bit = True,
+        use_gradient_checkpointing = "unsloth",
+    )
+    model = FastVisionModel.get_peft_model(
+        model,
+        finetune_vision_layers = False,
+        finetune_language_layers = True,
+        finetune_attention_modules = False,
+        finetune_mlp_modules = True,
+
+        r = 8,
+        lora_alpha = 8,
+        lora_dropout = 0,
+        bias = "none",
+        random_state = 3407,
+        use_rslora = False,
+        loftq_config = None,
+    )
+
+    return model, tokenizer
+
+def prep_train_dataset(num_img):
+
+    instruction = "Write the LaTeX representation for this image."
+    def convert_to_conversation(sample):
+        conversation = [
+            { "role": "user",
+              "content" : [
+                {"type" : "text", "text" : instruction},
+                {"type" : "image", "image" : sample["image"]} ]
+            },
+            { "role" : "assistant",
+              "content" : [
+                {"type" : "text", "text" : sample["text"]} ]
+            },
+        ]
+        return { "messages" : conversation }
+
+    dataset = load_dataset("unsloth/LaTeX_OCR", split = "train")
+
+    dataset = dataset.select(range(num_img))
+    converted_dataset = [convert_to_conversation(sample) for sample in dataset]
+    return converted_dataset
+
+def prep_train_model(model, tokenizer, dataset, num_epochs = 5):
+    FastVisionModel.for_training(model)
+
+    trainer = SFTTrainer(
+        model = model,
+        tokenizer = tokenizer,
+        data_collator = UnslothVisionDataCollator(model, tokenizer),
+        train_dataset = dataset,
+        args = SFTConfig(
+            per_device_train_batch_size = 1,
+            gradient_accumulation_steps = 4,
+            warmup_steps = 5,
+            # max_steps = 30,
+            num_train_epochs = num_epochs,
+            learning_rate = 2e-4,
+            fp16 = not is_bf16_supported(),
+            bf16 = is_bf16_supported(),
+            logging_steps = 1,
+            optim = "paged_adamw_8bit",
+            weight_decay = 0.01,
+            lr_scheduler_type = "linear",
+            seed = 3407,
+            output_dir = "outputs",
+            report_to = "none",
+
+            remove_unused_columns = False,
+            dataset_text_field = "",
+            dataset_kwargs = {"skip_prepare_dataset": True},
+            dataset_num_proc = 4,
+            max_seq_length = 2048,
+        ),
+    )
+    return trainer
+
+def start_mem():
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    return start_gpu_memory, max_memory
+
+def end_mem(start_gpu_memory, max_memory):
+    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+    used_percentage = round(used_memory / max_memory * 100, 3)
+    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+    return used_memory, used_memory_for_lora, used_percentage, lora_percentage
+
+
+def get_response(test_dataset, model, tokenizer, n):
+    FastVisionModel.for_inference(model)
+
+    image = test_dataset[n]["image"]
+    instruction = "Write the LaTeX representation for this image."
+
+    messages = [
+        {"role": "user", "content": [
+            {"type": "image"},
+            {"type": "text", "text": instruction}
+        ]}
+    ]
+    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = tokenizer(
+        image,
+        input_text,
+        add_special_tokens=False,
+        return_tensors="pt",
+    ).to("cuda")
+
+    if "token_type_ids" in inputs:
+        del inputs["token_type_ids"]
+
+    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+    output_id = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64,
+                               use_cache=True, temperature=1.5, min_p=0.1)
+    response = tokenizer.decode(output_id[0], skip_special_tokens=True)
+    return response
+
+def evaluate(response, reference):
+    P, R, F1 = score([response], [reference], model_type='bert-base-uncased', lang='en')
+    return {
+        "precision": P.mean().item(),
+        "recall": R.mean().item(),
+        "f1": F1.mean().item()
+    }
+
+def main():
+    model, tokenizer = load_model()
+    train_dataset = prep_train_dataset(100)
+    trainer = prep_train_model(model, tokenizer, train_dataset)
+    start_gpu_memory, max_memory = start_mem()
+    trainer.train()
+    (used_memory, used_memory_for_lora, used_percentage, lora_percentage) = end_mem(start_gpu_memory, max_memory)
+    test_dataset = load_dataset("unsloth/LaTeX_OCR", split = "test")
+    response_dict = {"precision": 0, "recall": 0, "f1": 0}
+
+    for img in range(10):
+        response = get_response(test_dataset, model, tokenizer, img)
+        reference = test_dataset[img]["text"]
+        accuracy = evaluate(response, reference)
+
+        response_dict["precision"] += accuracy["precision"]
+        response_dict["recall"] += accuracy["recall"]
+        response_dict["f1"] += accuracy["f1"]
+
+    precision = response_dict["precision"] / 10
+    recall = response_dict["recall"] / 10
+    f1 = response_dict["f1"] / 10
+
+    print(f"Used Memory: {used_memory} GB")
+    print(f"Used Memory for LoRA: {used_memory_for_lora} GB")
+
+    print(f"LaTeX OCR BERTScore: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/experiments/Accuracy_checking/Qwen_7B.py b/experiments/Accuracy_checking/Qwen_7B.py
new file mode 100644
index 0000000..af86479
--- /dev/null
+++ b/experiments/Accuracy_checking/Qwen_7B.py
@@ -0,0 +1,171 @@
+from unsloth import FastVisionModel
+import torch
+from unsloth import is_bf16_supported
+from unsloth.trainer import UnslothVisionDataCollator
+from trl import SFTTrainer, SFTConfig
+from datasets import load_dataset
+from bert_score import score
+import numpy as np
+from transformers import TextStreamer
+
+def load_model():
+    model, tokenizer = FastVisionModel.from_pretrained(
+        "unsloth/Qwen2-VL-7B-Instruct",
+        load_in_4bit = True,
+        use_gradient_checkpointing = "unsloth",
+    )
+
+    model = FastVisionModel.get_peft_model(
+        model,
+        finetune_vision_layers = True,
+        finetune_language_layers = True,
+        finetune_attention_modules = True,
+        finetune_mlp_modules = True,
+
+        r = 16,
+        lora_alpha = 16,
+        lora_dropout = 0,
+        bias = "none",
+        random_state = 3407,
+        use_rslora = False,
+        loftq_config = None,
+    )
+
+    return model, tokenizer
+
+def prep_train_dataset(num_img):
+
+    instruction = "Write the LaTeX representation for this image."
+    def convert_to_conversation(sample):
+        conversation = [
+            { "role": "user",
+              "content" : [
+                {"type" : "text", "text" : instruction},
+                {"type" : "image", "image" : sample["image"]} ]
+            },
+            { "role" : "assistant",
+              "content" : [
+                {"type" : "text", "text" : sample["text"]} ]
+            },
+        ]
+        return { "messages" : conversation }
+
+    dataset = load_dataset("unsloth/LaTeX_OCR", split = "train")
+
+    dataset = dataset.select(range(num_img))
+    converted_dataset = [convert_to_conversation(sample) for sample in dataset]
+    return converted_dataset
+
+def prep_train_model(model, tokenizer, converted_dataset, num_epochs = 5):
+    FastVisionModel.for_training(model)
+
+    trainer = SFTTrainer(
+        model = model,
+        tokenizer = tokenizer,
+        data_collator = UnslothVisionDataCollator(model, tokenizer),
+        train_dataset = converted_dataset,
+        args = SFTConfig(
+            per_device_train_batch_size = 2,
+            gradient_accumulation_steps = 4,
+            warmup_steps = 5,
+            # max_steps = 30,
+            num_train_epochs = num_epochs,
+            learning_rate = 2e-4,
+            fp16 = not is_bf16_supported(),
+            bf16 = is_bf16_supported(),
+            logging_steps = 1,
+            optim = "adamw_8bit",
+            weight_decay = 0.01,
+            lr_scheduler_type = "linear",
+            seed = 3407,
+            output_dir = "outputs",
+            report_to = "none",
+
+            remove_unused_columns = False,
+            dataset_text_field = "",
+            dataset_kwargs = {"skip_prepare_dataset": True},
+            dataset_num_proc = 4,
+            max_seq_length = 2048,
+        ),
+    )
+    return trainer
+
+def start_mem():
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    return start_gpu_memory, max_memory
+
+def end_mem(start_gpu_memory, max_memory):
+    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+    used_percentage = round(used_memory / max_memory * 100, 3)
+    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+    return used_memory, used_memory_for_lora, used_percentage, lora_percentage
+
+def get_response(test_dataset, model, tokenizer, n):
+    FastVisionModel.for_inference(model)
+
+    image = test_dataset[n]["image"]
+    instruction = "Write the LaTeX representation for this image."
+
+    messages = [
+        {"role": "user", "content": [
+            {"type": "image"},
+            {"type": "text", "text": instruction}
+        ]}
+    ]
+    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
+    inputs = tokenizer(
+        image,
+        input_text,
+        add_special_tokens = False,
+        return_tensors = "pt",
+    ).to("cuda")
+
+    text_streamer = TextStreamer(tokenizer, skip_prompt = True)
+    output_id = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
+                               use_cache = True, temperature = 1.5, min_p = 0.1)
+    response = tokenizer.decode(output_id[0], skip_special_tokens = True)
+    return response
+
+def evaluate(response, reference):
+    P, R, F1 = score([response], [reference], model_type='bert-base-uncased', lang='en')
+
+    return {
+        "precision": P.mean().item(),
+        "recall": R.mean().item(),
+        "f1": F1.mean().item()
+    }
+
+
+def main():
+    model, tokenizer = load_model()
+    train_dataset = prep_train_dataset(100)
+    trainer = prep_train_model(model, tokenizer, train_dataset)
+    start_gpu_memory, max_memory = start_mem()
+    trainer.train()
+    (used_memory, used_memory_for_lora, used_percentage, lora_percentage) = end_mem(start_gpu_memory, max_memory)
+    test_dataset = load_dataset("unsloth/LaTeX_OCR", split = "test")
+    response_dict = {"precision": 0, "recall": 0, "f1": 0}
+
+    for img in range(20):
+        response = get_response(test_dataset, model, tokenizer, img)
+        reference = test_dataset[img]["text"]
+        accuracy = evaluate(response, reference)
+
+        response_dict["precision"] += accuracy["precision"]
+        response_dict["recall"] += accuracy["recall"]
+        response_dict["f1"] += accuracy["f1"]
+
+    precision = response_dict["precision"] / 20
+    recall = response_dict["recall"] / 20
+    f1 = response_dict["f1"] / 20
+
+    print(f"Used Memory: {used_memory} GB")
+    print(f"Used Memory for LoRA: {used_memory_for_lora} GB")
+
+    print(f"LaTeX OCR BERTScore: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/experiments/convert_to_conversation/convert_to_conversation.py b/experiments/convert_to_conversation/convert_to_conversation.py
new file mode 100644
index 0000000..81a79bc
--- /dev/null
+++ b/experiments/convert_to_conversation/convert_to_conversation.py
@@ -0,0 +1,204 @@
+import os
+import json
+from typing import Optional, Dict, Union, Tuple
+import random
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from datasets import load_dataset, Dataset, DatasetDict
+from PIL import Image
+
+def detect_split(dataset):
+    if isinstance(dataset, DatasetDict):
+        splits = {k: v for k, v in dataset.items() if k in ['train', 'validation', 'test']}
+        print(f"Found splits: {list(splits.keys())}")
+        return True, splits
+    return False, {'dataset': dataset}
+
+def split_data_function(dataset, train_percent=100.0, val_percent=0.0, test_percent=0.0, seed=42):
+    total_percent = train_percent + val_percent + test_percent
+    if not (0 <= train_percent <= 100 and 0 <= val_percent <= 100 and 0 <= test_percent <= 100):
+        raise ValueError("Percentages must be between 0 and 100.")
+    if total_percent == 0:
+        raise ValueError("At least one split percentage must be greater than 0.")
+
+    if total_percent != 100:
+        train_percent = (train_percent / total_percent) * 100
+        val_percent = (val_percent / total_percent) * 100
+        test_percent = (test_percent / total_percent) * 100
+        print(f"Normalized percentages: train={train_percent:.1f}%, val={val_percent:.1f}%, test={test_percent:.1f}%")
+
+    splits = {}
+    remaining_data = dataset
+    total_size = len(dataset)
+    train_size = int((train_percent / 100) * total_size)
+    val_size = int((val_percent / 100) * total_size)
+    test_size = int((test_percent / 100) * total_size)
+
+    print(f"Splitting dataset: train={train_size}, val={val_size}, test={test_size}")
+
+    if train_percent > 0:
+        if val_percent > 0 or test_percent > 0:
+            train, remaining_data = train_test_split(
+                remaining_data, train_size=train_size, random_state=seed
+            )
+        else:
+            train = remaining_data
+        splits['train'] = train
+
+    if val_percent > 0 and remaining_data:
+        if test_percent > 0:
+            val, remaining_data = train_test_split(
+                remaining_data, train_size=val_size, random_state=seed
+            )
+        else:
+            val = remaining_data
+        splits['validation'] = val
+
+    if test_percent > 0 and remaining_data:
+        splits['test'] = remaining_data
+
+    return splits
+
+def make_serializable(obj):
+    if isinstance(obj, Image.Image):
+        return f"PIL.Image.Image(mode={obj.mode}, size={obj.size})"
+    return str(obj)
+
+def format_as_unsloth(sample, split="train", caption_col=None, data_category="caption", image_col=None, image_path_prefix=None, prompt_text=None):
+    if prompt_text is None:
+        prompt_text = (
+            "Answer the question based on the image." if data_category == "vqa"
+            else "Describe this image in detail."
+        )
+
+    messages = []
+
+    if split in ["train", "validation"]:
+
+        user_content = []
+        if image_col and image_col in sample and sample[image_col]:
+            if image_path_prefix and isinstance(sample[image_col], str):
+                image_path = os.path.join(image_path_prefix, sample[image_col])
+                user_content.append({"type": "image_url", "image_url": {"url": image_path}})
+            else:
+                user_content.append({"type": "image", "image": sample[image_col]})
+
+        user_content.append({"type": "text", "text": prompt_text})
+        messages.append({"role": "user", "content": user_content})
+
+        if caption_col and caption_col in sample and sample[caption_col]:
+            messages.append({"role": "assistant", "content": sample[caption_col]})
+
+    else:
+        user_content = []
+        if image_col and image_col in sample and sample[image_col]:
+            if image_path_prefix and isinstance(sample[image_col], str):
+                image_path = os.path.join(image_path_prefix, sample[image_col])
+                user_content.append({"type": "image_url", "image_url": {"url": image_path}})
+            else:
+                user_content.append({"type": "image", "image": sample[image_col]})
+
+        user_content.append({"type": "text", "text": prompt_text})
+        messages.append({"role": "user", "content": user_content})
+
+    return {"messages": messages}
+
+def convert_to_unsloth(
+    input_path_or_name,
+    dataset_type="hf",
+    data_category="caption",
+    train_percent=100.0,
+    val_percent=0.0,
+    test_percent=0.0,
+    prompt_text=None,
+    caption_col=None,
+    image_col=None,
+    image_path_prefix=None
+):
+    if image_col is None:
+        image_col = "image"
+    if caption_col is None:
+        caption_col = "answers" if data_category == "vqa" else "caption"
+
+    if dataset_type == "hf":
+        try:
+            dataset = load_dataset(input_path_or_name)
+        except Exception as e:
+            print(f"Error loading dataset: {e}")
+            return None
+    else:
+        try:
+            with open(input_path_or_name, "r", encoding="utf-8") as f:
+                content = f.read().strip()
+            if content.startswith('['):
+                lines = json.loads(content)
+            else:
+                lines = [json.loads(l.strip()) for l in content.splitlines() if l.strip()]
+            dataset = lines
+        except Exception as e:
+            print(f"Error loading JSON file: {e}")
+            return None
+
+    is_split, splits = detect_split(dataset)
+
+    if is_split:
+        available_splits = set(splits.keys())
+        requested_splits = {
+            name for name, pct in zip(["train", "validation", "test"], [train_percent, val_percent, test_percent]) if pct > 0
+        }
+
+        if not requested_splits.issubset(available_splits):
+            combined_data = []
+            for split_part in splits.values():
+                combined_data.extend([dict(row) for row in split_part])
+            splits = split_data_function(
+                combined_data,
+                train_percent=train_percent,
+                val_percent=val_percent,
+                test_percent=test_percent,
+            )
+            splits = {k: Dataset.from_list(v) for k, v in splits.items()}
+    else:
+        if isinstance(dataset, DatasetDict):
+            dataset = [dict(row) for row in dataset]
+        splits = split_data_function(
+            dataset,
+            train_percent=train_percent,
+            val_percent=val_percent,
+            test_percent=test_percent,
+        )
+        splits = {k: Dataset.from_list(v) for k, v in splits.items()}
+
+    for split_name, split_data in splits.items():
+        print(f"{split_name.capitalize()}: {len(split_data)} samples")
+
+    formatted_outputs = {}
+
+    for split_name, split_data in splits.items():
+        if len(split_data) > 0:
+            formatted_sample = format_as_unsloth(
+                split_data[0],
+                split=split_name,
+                data_category=data_category,
+                prompt_text=prompt_text,
+                caption_col=caption_col,
+                image_col=image_col,
+                image_path_prefix=image_path_prefix
+            )
+            serializable_formatted = json.loads(json.dumps(formatted_sample, default=make_serializable))
+            print(f"\nFormatted sample from '{split_name}' split:")
+            print(json.dumps(serializable_formatted, indent=2, ensure_ascii=False))
+            formatted_outputs[split_name] = formatted_sample
+
+    return formatted_outputs
+
+if __name__ == "__main__":
+    formatted_outputs = convert_to_unsloth(
+        input_path_or_name="unsloth/Radiology_mini",
+        dataset_type="hf",
+        data_category="caption",
+        train_percent=60.0,
+        val_percent=20.0,
+        test_percent=20.0,
+    )
+