
Commit d6d5fda

[Open-source internship] Bigbird pegasus model fine-tuning (#1994)

File tree

3 files changed: +400 -25 lines changed
Lines changed: 60 additions & 25 deletions
@@ -1,25 +1,60 @@
# bigbird_pegasus model fine-tuning comparison

## Train loss

Comparison of the training loss during fine-tuning.

| epoch | mindnlp+mindspore | transformers+torch (4060) | transformers+torch (4060, another run) |
| ----- | ----------------- | ------------------------- | -------------------------------------- |
| 1     | 2.0958            | 8.7301                    | 5.4650                                 |
| 2     | 1.969             | 8.1557                    | 4.6890                                 |
| 3     | 1.8755            | 7.7516                    | 4.2572                                 |
| 4     | 1.8264            | 7.5017                    | 4.0263                                 |
| 5     | 1.7349            | 7.2614                    | 3.9444                                 |
| 6     | 1.678             | 7.0559                    | 3.8428                                 |
| 7     | 1.6937            | 6.8405                    | 3.7187                                 |
| 8     | 1.654             | 6.7297                    | 3.7192                                 |
| 9     | 1.6365            | 6.7136                    | 3.5434                                 |
| 10    | 1.7003            | 6.6279                    | 3.5881                                 |

## Eval loss

Comparison of the evaluation scores.

| epoch | mindnlp+mindspore  | transformers+torch (4060) | transformers+torch (4060, another run) |
| ----- | ------------------ | ------------------------- | -------------------------------------- |
| 1     | 2.1257965564727783 | 6.3235931396484375        | 4.264792442321777                      |

# bigbird_pegasus fine-tuning

This is a fine-tuning experiment for the bigbird_pegasus model on the google/Synthetic-Persona-Chat dataset.

The task is tracked at https://gitee.com/mindspore/community/issues/IAUPBF

The transformers+pytorch+3090 benchmark was written by the author; the repository is at https://github.com/outbreak-sen/bigbird_pegasus_finetune

The changed code lives in llm/finetune/bigbird_prgasus and contains only the mindnlp+mindspore implementation.
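For reference, here is a minimal sketch of the transformers+PyTorch side of the generation check (the actual benchmark script lives in the repository linked above). It loads the same `google/bigbird-pegasus-large-arxiv` checkpoint and asks the same question used in the dialogue test further down; default `generate` settings are assumed.

```python
# Hedged sketch: not the benchmark script itself, only the pre-fine-tuning generation check.
from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration

model_name = "google/bigbird-pegasus-large-arxiv"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BigBirdPegasusForConditionalGeneration.from_pretrained(model_name)

question = "Nice to meet you too. What are you interested in?"
inputs = tokenizer([question], return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```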
The experimental results are as follows.
## Loss Values

| No. | MindNLP | PyTorch |
|-----|---------|---------|
| 1   | 0.1826  | 7.6556  |
| 2   | 0.1614  | 0.5960  |
| 3   | 0.1435  | 0.4145  |
| 4   | 0.1398  | 0.3022  |
| 5   | 0.1344  | 0.2555  |
| 6   | 0.1263  | 0.2357  |
| 7   | 0.1200  | 0.2247  |
| 8   | 0.1147  | 0.2166  |
| 9   | 0.1105  | 0.2107  |
| 10  | 0.1082  | 0.2075  |

## Eval Loss Values

| No. | MindNLP | PyTorch |
|-----|---------|---------|
| 1   | 0.2397  | 0.8738  |
| 2   | 0.2451  | 0.4804  |
| 3   | 0.2530  | 0.3490  |
| 4   | 0.2548  | 0.2861  |
| 5   | 0.2595  | 0.2669  |
| 6   | 0.2663  | 0.2612  |
| 7   | 0.2690  | 0.2545  |
| 8   | 0.2755  | 0.2526  |
| 9   | 0.2791  | 0.2519  |
| 10  | 0.2831  | 0.2510  |
| 11  | 0.2831  | 0.2510  |

## Dialogue test

The question is the first question from the evaluation dataset.

* Input question:

Nice to meet you too. What are you interested in?

* MindNLP answer before fine-tuning:

we present a new method for the detection of rare events, based on the use of time - frequency combs.<n> we show how this technique can be used to detect rare events in a broad range of time - frequency domains.<n> we also show how this technique can be used to study the evolution of the spectrum of rare events. <n> rare events ; amplitude ; phase ; amplitude ; frequency ; time - frequency combs + _ pacs : _<n> 11.30.er, 12.20.fv, 12.20.ds, 12.60.jv, 12.60.jv @xmath0 department of physics and astronomy, iowa state university, ames, ia 50011 + @xmath1 department of physics and astronomy, university of iowa, ames, ia 50011 + @xmath2 department of physics and astronomy, university of iowa, ames, ia 50011 + _ key words : _ rare events ; amplitude ; phase ; frequency ; spectrum ; time - frequency combs + _ pacs : _<n> 11.30.er

* MindNLP answer after fine-tuning:

I'm interested in a lot of things, but I'm especially interested in history and science.

* PyTorch answer before fine-tuning:

we present a new method for the detection of rare events , based on the use of time - frequency combs .<n> we show how this technique can be used to detect rare events in a broad range of time - frequency domains .<n> we also show how this technique can be used to study the evolution of the spectrum of rare events . <n> rare events ; amplitude ; phase ; amplitude ; frequency ; time - frequency combs + _ pacs : _<n> 11.30.er , 12.20.fv , 12.20.ds , 12.60.jv , 12.60.jv @xmath0 department of physics and astronomy , iowa state university , ames , ia 50011 + @xmath1 department of physics and astronomy , university of iowa , ames , ia 50011 + @xmath2 department of physics and astronomy , university of iowa , ames , ia 50011 + _ key words : _ rare events ; amplitude ; phase ; frequency ; spectrum ; time - frequency combs + _ pacs : _<n> 11.30.er

* PyTorch answer after fine-tuning:

how do you like to do for fun?

Lines changed: 189 additions & 0 deletions
@@ -0,0 +1,189 @@
from mindnlp.transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer
from mindnlp.engine import Trainer, TrainingArguments
from datasets import load_dataset, load_from_disk
import mindspore as ms
import os

# Set the execution mode and target device
ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend")
# Point HF_ENDPOINT at the HuggingFace mirror
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# Load the model and tokenizer
print("Loading model and tokenizer")
model_name = "google/bigbird-pegasus-large-arxiv"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BigBirdPegasusForConditionalGeneration.from_pretrained(model_name)
print("Model and tokenizer loaded")

# Generation check before fine-tuning
input_text = "Nice to meet you too. What are you interested in?"
print("input question:", input_text)
input_tokens = tokenizer([input_text], return_tensors="ms")
output_tokens = model.generate(**input_tokens)
print("output answer:", tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0])

print("Loading the dataset")
# Path where the preprocessed dataset is cached
dataset_path = "./Persona_valid_preprocessed"
# Reuse the preprocessed dataset if it already exists
if os.path.exists(dataset_path):
    # Load the preprocessed dataset from disk
    dataset_train = load_from_disk("./Persona_train_preprocessed")
    dataset_valid = load_from_disk("./Persona_valid_preprocessed")
else:
    dataset = load_dataset("google/Synthetic-Persona-Chat")
    print("dataset finished")
    print("dataset:", dataset)
    print("dataset['train'][0]:", dataset["train"][0])
    dataset_train = dataset["train"]
    dataset_valid = dataset["validation"]
    print("dataset_train:", dataset_train)
    print("dataset_train['Best Generated Conversation'][0]:\n",
          dataset_train["Best Generated Conversation"][0])
    print("dataset_train['user 1 personas'][0]:",
          dataset_train["user 1 personas"][0])
    print("dataset_train['user 2 personas'][0]:",
          dataset_train["user 2 personas"][0])
    print("dataset_train.column_names:",
          dataset_train.column_names)

    # Preprocessing: format each conversation into context-reply pairs
    def format_dialogue(examples):
        inputs, targets = [], []
        for conversation in examples["Best Generated Conversation"]:
            # Split the conversation into lines
            lines = conversation.split("\n")
            # Split the conversation into context and reply
            for i in range(len(lines) - 1):
                context = "\n".join(lines[:i + 1])  # the context is the current line and all lines before it
                reply = lines[i + 1]  # the next line is the reply
                context = context.replace("User 1: ", "")
                context = context.replace("User 2: ", "")
                if context.strip() and reply.strip():  # keep only non-empty context/reply pairs
                    inputs.append(context.strip())
                    targets.append(reply.strip())
        return {"input": inputs, "target": targets}

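    # Illustrative (hypothetical) example of what format_dialogue produces:
    # the conversation "User 1: Hi!\nUser 2: Hello there.\nUser 1: Nice to meet you too."
    # expands into the pairs
    #   input: "Hi!"                -> target: "User 2: Hello there."
    #   input: "Hi!\nHello there."  -> target: "User 1: Nice to meet you too."
    # (speaker prefixes are stripped from the context but kept on the target reply).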
    # Apply the preprocessing function
    dataset_train = dataset_train.map(format_dialogue, batched=True,
                                      remove_columns=["user 1 personas",
                                                      "user 2 personas",
                                                      "Best Generated Conversation"])
    dataset_valid = dataset_valid.map(format_dialogue, batched=True,
                                      remove_columns=["user 1 personas",
                                                      "user 2 personas",
                                                      "Best Generated Conversation"])
    # Save the preprocessed datasets
    dataset_train.save_to_disk("./Persona_train_preprocessed")
    dataset_valid.save_to_disk("./Persona_valid_preprocessed")
print("Tokenizing the dataset")
# Path where the tokenized dataset is cached
dataset_path = "./PersonaTokenized_train_preprocessed"
# Reuse the tokenized dataset if it already exists
if os.path.exists(dataset_path):
    # Load the tokenized dataset from disk
    dataset_train_tokenized = load_from_disk("./PersonaTokenized_train_preprocessed")
    dataset_valid_tokenized = load_from_disk("./PersonaTokenized_valid_preprocessed")
else:
    # Tokenize the context-reply pairs
    def tokenize_function(examples):
        model_inputs = tokenizer(
            examples["input"],
            max_length=128,
            truncation=True,
            padding="max_length",
        )
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(
                examples["target"],
                max_length=128,
                truncation=True,
                padding="max_length",
            )
        # Adds "labels" next to "input_ids" and "attention_mask"
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    dataset_train_tokenized = dataset_train.map(tokenize_function)
    dataset_valid_tokenized = dataset_valid.map(tokenize_function)
    dataset_train_tokenized = dataset_train_tokenized.filter(
        lambda example: len(example["input_ids"]) > 0 and len(example["labels"]) > 0)
    dataset_valid_tokenized = dataset_valid_tokenized.filter(
        lambda example: len(example["input_ids"]) > 0 and len(example["labels"]) > 0)
    # Save the tokenized datasets
    dataset_train_tokenized.save_to_disk("./PersonaTokenized_train_preprocessed")
    dataset_valid_tokenized.save_to_disk("./PersonaTokenized_valid_preprocessed")

# Compute one percent of the data volume
train_size = len(dataset_train_tokenized)
valid_size = len(dataset_valid_tokenized)
train_subset_size = train_size // 100
valid_subset_size = valid_size // 100
# Use select to take the first one percent of the examples
dataset_train_tokenized = dataset_train_tokenized.select(range(train_subset_size))
dataset_valid_tokenized = dataset_valid_tokenized.select(range(valid_subset_size))
print("dataset_train_tokenized:", dataset_train_tokenized)
print("dataset_valid_tokenized:", dataset_valid_tokenized)

import numpy as np

def data_generator(dataset):
    for item in dataset:
        yield (
            np.array(item["input_ids"], dtype=np.int32),       # input_ids
            np.array(item["attention_mask"], dtype=np.int32),  # attention_mask
            np.array(item["labels"], dtype=np.int32)           # labels
        )

import mindspore.dataset as ds

# Convert the train and validation sets into MindSpore datasets.
# Note: the column must be named "labels" (not "label") to match the model's forward signature.
def create_mindspore_dataset(dataset, shuffle=True):
    return ds.GeneratorDataset(
        source=lambda: data_generator(dataset),  # wrap the generator in a lambda
        column_names=["input_ids", "attention_mask", "labels"],
        shuffle=shuffle
    )

dataset_train_tokenized = create_mindspore_dataset(dataset_train_tokenized, shuffle=True)
dataset_valid_tokenized = create_mindspore_dataset(dataset_valid_tokenized, shuffle=False)

TOKENS = 20  # defined but not used below
EPOCHS = 10
BATCH_SIZE = 4

training_args = TrainingArguments(
    output_dir='./MindNLP_BigBirdPegasus_persona_finetuned',
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    save_steps=500,                # save a checkpoint every 500 steps
    save_total_limit=2,            # keep only the last 2 checkpoints
    logging_dir="./MindNLP_logs",  # directory for logs
    logging_steps=100,             # log every 100 steps
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    eval_steps=500,                # evaluation frequency
    warmup_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,             # weight decay
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train_tokenized,
    eval_dataset=dataset_valid_tokenized
)
print("Starting training")
# Start training
trainer.train()
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

model.save_pretrained("./MindNLP_BigBirdPegasus_persona_finetuned")
tokenizer.save_pretrained("./MindNLP_BigBirdPegasus_persona_finetuned")
fine_tuned_model = BigBirdPegasusForConditionalGeneration.from_pretrained("./MindNLP_BigBirdPegasus_persona_finetuned")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./MindNLP_BigBirdPegasus_persona_finetuned")
# Run the dialogue check again with the fine-tuned model
print("Testing the dialogue again")
input_text = "Nice to meet you too. What are you interested in?"
print("input question:", input_text)
input_tokens = fine_tuned_tokenizer([input_text], return_tensors="ms")
output_tokens = fine_tuned_model.generate(**input_tokens)
print("output answer:", fine_tuned_tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0])
