| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": null, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [], |
| 8 | + "source": [ |
| 9 | + "import os\n", |
| 10 | + "import tensorflow as tf\n", |
| 11 | + "from transformers import BertTokenizer, TFBertModel\n", |
| 12 | + "\n", |
| 13 | + "import numpy as np\n", |
| 14 | + "import pandas as pd\n", |
| 15 | + "\n", |
| 16 | + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", |
| 17 | + "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", |
| 18 | + "\n", |
| 19 | + "import matplotlib.pyplot as plt" |
| 20 | + ] |
| 21 | + }, |
| 22 | + { |
| 23 | + "cell_type": "code", |
| 24 | + "execution_count": null, |
| 25 | + "metadata": {}, |
| 26 | + "outputs": [], |
| 27 | + "source": [ |
| 28 | + "# Visualization helper\n", |
| 29 | + "\n", |
| 30 | + "def plot_graphs(history, string):\n", |
| 31 | + " plt.plot(history.history[string])\n", |
| 32 | + " plt.plot(history.history['val_'+string])\n", |
| 33 | + " plt.xlabel(\"Epochs\")\n", |
| 34 | + " plt.ylabel(string)\n", |
| 35 | + " plt.legend([string, 'val_'+string])\n", |
| 36 | + " plt.show()" |
| 37 | + ] |
| 38 | + }, |
| 39 | + { |
| 40 | + "cell_type": "code", |
| 41 | + "execution_count": null, |
| 42 | + "metadata": {}, |
| 43 | + "outputs": [], |
| 44 | + "source": [ |
| 45 | + "# Fix random seeds for reproducibility\n", |
| 46 | + "\n", |
| 47 | + "tf.random.set_seed(1234)\n", |
| 48 | + "np.random.seed(1234)\n", |
| 49 | + "\n", |
| 50 | + "# BASE PARAM\n", |
| 51 | + "\n", |
| 52 | + "BATCH_SIZE = 32\n", |
| 53 | + "NUM_EPOCHS = 3\n", |
| 54 | + "MAX_LEN = 24 * 2 # roughly twice the average token length of a sentence pair\n", |
| 55 | + "\n", |
| 56 | + "DATA_IN_PATH = './data_in/KOR'\n", |
| 57 | + "DATA_OUT_PATH = \"./data_out/KOR\"" |
| 58 | + ] |
| 59 | + }, |
| 60 | + { |
| 61 | + "cell_type": "markdown", |
| 62 | + "metadata": {}, |
| 63 | + "source": [ |
| 64 | + "# KorNLI Dataset\n", |
| 65 | + "\n", |
| 66 | + "Data from Kakaobrain: https://github.com/kakaobrain/KorNLUDatasets" |
| 67 | + ] |
| 68 | + }, |
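|  | + { |
|  | + "cell_type": "markdown", |
|  | + "metadata": {}, |
|  | + "source": [ |
|  | + "As the loading code below assumes, each KorNLI TSV provides `sentence1`, `sentence2`, and `gold_label` columns. The Korean SNLI and MultiNLI translations are concatenated for training, while the XNLI dev and test splits are used for validation and final evaluation." |
|  | + ] |
|  | + }, |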
| 69 | + { |
| 70 | + "cell_type": "code", |
| 71 | + "execution_count": null, |
| 72 | + "metadata": {}, |
| 73 | + "outputs": [], |
| 74 | + "source": [ |
| 75 | + "# Load Train dataset\n", |
| 76 | + "\n", |
| 77 | + "TRAIN_SNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'snli_1.0_train.kor.tsv')\n", |
| 78 | + "TRAIN_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'multinli.train.ko.tsv')\n", |
| 79 | + "DEV_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.dev.ko.tsv')\n", |
| 80 | + "\n", |
| 81 | + "train_data_snli = pd.read_csv(TRAIN_SNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n", |
| 82 | + "train_data_xnli = pd.read_csv(TRAIN_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n", |
| 83 | + "dev_data_xnli = pd.read_csv(DEV_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n", |
| 84 | + "\n", |
| 85 | + "train_data_snli_xnli = pd.concat([train_data_snli, train_data_xnli])  # DataFrame.append was removed in pandas 2.x\n", |
| 86 | + "train_data_snli_xnli = train_data_snli_xnli.dropna()\n", |
| 87 | + "train_data_snli_xnli = train_data_snli_xnli.reset_index(drop=True)\n", |
| 88 | + "\n", |
| 89 | + "dev_data_xnli = dev_data_xnli.dropna()\n", |
| 90 | + "\n", |
| 91 | + "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data_snli_xnli), len(dev_data_xnli)))" |
| 92 | + ] |
| 93 | + }, |
| 94 | + { |
| 95 | + "cell_type": "code", |
| 96 | + "execution_count": null, |
| 97 | + "metadata": {}, |
| 98 | + "outputs": [], |
| 99 | + "source": [ |
| 100 | + "# Bert Tokenizer\n", |
| 101 | + "\n", |
| 102 | + "# Reference: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus\n", |
| 103 | + "\n", |
| 104 | + "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", cache_dir='bert_ckpt', do_lower_case=False)\n", |
| 105 | + "\n", |
| 106 | + "def bert_tokenizer_v2(sent1, sent2, MAX_LEN):\n", |
| 107 | + " \n", |
| 108 | + " # For two-sentence (sentence pair) input\n", |
| 109 | + " \n", |
| 110 | + " encoded_dict = tokenizer.encode_plus(\n", |
| 111 | + " text = sent1,\n", |
| 112 | + " text_pair = sent2,\n", |
| 113 | + " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", |
| 114 | + " max_length = MAX_LEN, # Pad & truncate all sentences.\n", |
| 115 | + " pad_to_max_length = True, # deprecated in newer transformers; use padding='max_length', truncation=True\n", |
| 116 | + " return_attention_mask = True # Construct attn. masks.\n", |
| 117 | + " \n", |
| 118 | + " )\n", |
| 119 | + " \n", |
| 120 | + " input_id = encoded_dict['input_ids']\n", |
| 121 | + " attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).\n", |
| 122 | + " token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences\n", |
| 123 | + " \n", |
| 124 | + " return input_id, attention_mask, token_type_id" |
| 125 | + ] |
| 126 | + }, |
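|  | + { |
|  | + "cell_type": "markdown", |
|  | + "metadata": {}, |
|  | + "source": [ |
|  | + "A quick sanity check of `bert_tokenizer_v2` (an illustrative sketch; the two sample sentences are made up, not taken from KorNLI). It shows that `encode_plus` pads the pair to `MAX_LEN` and that `token_type_ids` marks which tokens belong to the second sentence." |
|  | + ] |
|  | + }, |
|  | + { |
|  | + "cell_type": "code", |
|  | + "execution_count": null, |
|  | + "metadata": {}, |
|  | + "outputs": [], |
|  | + "source": [ |
|  | + "# Illustrative example only: placeholder sentences, not dataset rows\n", |
|  | + "sample_sent1 = \"나는 책을 읽고 있다.\"\n", |
|  | + "sample_sent2 = \"나는 지금 독서 중이다.\"\n", |
|  | + "\n", |
|  | + "sample_ids, sample_mask, sample_types = bert_tokenizer_v2(sample_sent1, sample_sent2, MAX_LEN)\n", |
|  | + "\n", |
|  | + "print(\"padded length:\", len(sample_ids))  # == MAX_LEN\n", |
|  | + "print(\"tokens:\", tokenizer.convert_ids_to_tokens(sample_ids)[:12])\n", |
|  | + "print(\"token_type_ids:\", sample_types[:12])  # 0 = first sentence, 1 = second sentence" |
|  | + ] |
|  | + }, |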
| 127 | + { |
| 128 | + "cell_type": "code", |
| 129 | + "execution_count": null, |
| 130 | + "metadata": {}, |
| 131 | + "outputs": [], |
| 132 | + "source": [ |
| 133 | + "input_ids = []\n", |
| 134 | + "attention_masks = []\n", |
| 135 | + "token_type_ids = []\n", |
| 136 | + "\n", |
| 137 | + "for sent1, sent2 in zip(train_data_snli_xnli['sentence1'], train_data_snli_xnli['sentence2']):\n", |
| 138 | + " try:\n", |
| 139 | + " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n", |
| 140 | + "\n", |
| 141 | + " input_ids.append(input_id)\n", |
| 142 | + " attention_masks.append(attention_mask)\n", |
| 143 | + " token_type_ids.append(token_type_id)\n", |
| 144 | + " except Exception as e:\n", |
| 145 | + " print(e)\n", |
| 146 | + " print(sent1, sent2)\n", |
| 147 | + " pass\n", |
| 148 | + " \n", |
| 149 | + "train_snli_xnli_input_ids = np.array(input_ids, dtype=int)\n", |
| 150 | + "train_snli_xnli_attention_masks = np.array(attention_masks, dtype=int)\n", |
| 151 | + "train_snli_xnli_type_ids = np.array(token_type_ids, dtype=int)\n", |
| 152 | + "train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)" |
| 153 | + ] |
| 154 | + }, |
| 155 | + { |
| 156 | + "cell_type": "markdown", |
| 157 | + "metadata": {}, |
| 158 | + "source": [ |
| 159 | + "# DEV SET Preprocessing" |
| 160 | + ] |
| 161 | + }, |
| 162 | + { |
| 163 | + "cell_type": "code", |
| 164 | + "execution_count": null, |
| 165 | + "metadata": {}, |
| 166 | + "outputs": [], |
| 167 | + "source": [ |
| 168 | + "# Apart from the tokenizer, this follows the same approach as Chapter 5\n", |
| 169 | + "input_ids = []\n", |
| 170 | + "attention_masks = []\n", |
| 171 | + "token_type_ids = []\n", |
| 172 | + "\n", |
| 173 | + "for sent1, sent2 in zip(dev_data_xnli['sentence1'], dev_data_xnli['sentence2']):\n", |
| 174 | + " try:\n", |
| 175 | + " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n", |
| 176 | + "\n", |
| 177 | + " input_ids.append(input_id)\n", |
| 178 | + " attention_masks.append(attention_mask)\n", |
| 179 | + " token_type_ids.append(token_type_id)\n", |
| 180 | + " except Exception as e:\n", |
| 181 | + " print(e)\n", |
| 182 | + " print(sent1, sent2)\n", |
| 183 | + " pass\n", |
| 184 | + " \n", |
| 185 | + "dev_xnli_input_ids = np.array(input_ids, dtype=int)\n", |
| 186 | + "dev_xnli_attention_masks = np.array(attention_masks, dtype=int)\n", |
| 187 | + "dev_xnli_type_ids = np.array(token_type_ids, dtype=int)\n", |
| 188 | + "dev_xnli_inputs = (dev_xnli_input_ids, dev_xnli_attention_masks, dev_xnli_type_ids)" |
| 189 | + ] |
| 190 | + }, |
| 191 | + { |
| 192 | + "cell_type": "code", |
| 193 | + "execution_count": null, |
| 194 | + "metadata": { |
| 195 | + "scrolled": true |
| 196 | + }, |
| 197 | + "outputs": [], |
| 198 | + "source": [ |
| 199 | + "# Convert the labels (entailment, contradiction, neutral) to integer ids\n", |
| 200 | + "label_dict = {\"entailment\": 0, \"contradiction\": 1, \"neutral\": 2}\n", |
| 201 | + "def convert_int(label):\n", |
| 202 | + " num_label = label_dict[label] \n", |
| 203 | + " return num_label\n", |
| 204 | + "\n", |
| 205 | + "train_data_snli_xnli[\"gold_label_int\"] = train_data_snli_xnli[\"gold_label\"].apply(convert_int)\n", |
| 206 | + "train_data_labels = np.array(train_data_snli_xnli['gold_label_int'], dtype=int)\n", |
| 207 | + "\n", |
| 208 | + "dev_data_xnli[\"gold_label_int\"] = dev_data_xnli[\"gold_label\"].apply(convert_int)\n", |
| 209 | + "dev_data_labels = np.array(dev_data_xnli['gold_label_int'], dtype=int)\n", |
| 210 | + "\n", |
| 211 | + "print(\"# train labels: {}, # dev labels: {}\".format(len(train_data_labels), len(dev_data_labels)))" |
| 212 | + ] |
| 213 | + }, |
| 214 | + { |
| 215 | + "cell_type": "code", |
| 216 | + "execution_count": null, |
| 217 | + "metadata": {}, |
| 218 | + "outputs": [], |
| 219 | + "source": [ |
| 220 | + "class TFBertClassifier(tf.keras.Model):\n", |
| 221 | + " def __init__(self, model_name, dir_path, num_class):\n", |
| 222 | + " super(TFBertClassifier, self).__init__()\n", |
| 223 | + "\n", |
| 224 | + " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n", |
| 225 | + " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n", |
| 226 | + " self.classifier = tf.keras.layers.Dense(num_class, \n", |
| 227 | + " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range), \n", |
| 228 | + " name=\"classifier\")\n", |
| 229 | + " \n", |
| 230 | + " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n", |
| 231 | + " \n", |
| 232 | + " # outputs: sequence_output, pooled_output, (hidden_states), (attentions)\n", |
| 233 | + " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n", |
| 234 | + " pooled_output = outputs[1] \n", |
| 235 | + " pooled_output = self.dropout(pooled_output, training=training)\n", |
| 236 | + " logits = self.classifier(pooled_output)\n", |
| 237 | + "\n", |
| 238 | + " return logits\n", |
| 239 | + "\n", |
| 240 | + "cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',\n", |
| 241 | + " dir_path='bert_ckpt',\n", |
| 242 | + " num_class=3)" |
| 243 | + ] |
| 244 | + }, |
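|  | + { |
|  | + "cell_type": "markdown", |
|  | + "metadata": {}, |
|  | + "source": [ |
|  | + "A minimal sketch (not part of the original training flow): run the freshly built classifier on a dummy batch to check the wiring. The pooled `[CLS]` output goes through dropout and the dense head, giving one logit per NLI class." |
|  | + ] |
|  | + }, |
|  | + { |
|  | + "cell_type": "code", |
|  | + "execution_count": null, |
|  | + "metadata": {}, |
|  | + "outputs": [], |
|  | + "source": [ |
|  | + "# Dummy batch of shape (1, MAX_LEN); values are arbitrary but valid token ids\n", |
|  | + "dummy_ids = tf.ones((1, MAX_LEN), dtype=tf.int32)\n", |
|  | + "dummy_mask = tf.ones((1, MAX_LEN), dtype=tf.int32)\n", |
|  | + "dummy_types = tf.zeros((1, MAX_LEN), dtype=tf.int32)\n", |
|  | + "\n", |
|  | + "dummy_logits = cls_model(dummy_ids, attention_mask=dummy_mask, token_type_ids=dummy_types)\n", |
|  | + "print(dummy_logits.shape)  # (1, 3): entailment / contradiction / neutral logits" |
|  | + ] |
|  | + }, |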
| 245 | + { |
| 246 | + "cell_type": "code", |
| 247 | + "execution_count": null, |
| 248 | + "metadata": {}, |
| 249 | + "outputs": [], |
| 250 | + "source": [ |
| 251 | + "# Prepare for training\n", |
| 252 | + "optimizer = tf.keras.optimizers.Adam(3e-5)\n", |
| 253 | + "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", |
| 254 | + "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n", |
| 255 | + "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])" |
| 256 | + ] |
| 257 | + }, |
| 258 | + { |
| 259 | + "cell_type": "code", |
| 260 | + "execution_count": null, |
| 261 | + "metadata": {}, |
| 262 | + "outputs": [], |
| 263 | + "source": [ |
| 264 | + "# Run training\n", |
| 265 | + "model_name = \"tf2_KorNLI\"\n", |
| 266 | + "\n", |
| 267 | + "# Add early stopping to prevent overfitting\n", |
| 268 | + "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)\n", |
| 269 | + "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", |
| 270 | + "checkpoint_dir = os.path.dirname(checkpoint_path)\n", |
| 271 | + "\n", |
| 272 | + "# Create the checkpoint directory if it does not already exist\n", |
| 273 | + "if os.path.exists(checkpoint_dir):\n", |
| 274 | + " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", |
| 275 | + "else:\n", |
| 276 | + " os.makedirs(checkpoint_dir, exist_ok=True)\n", |
| 277 | + " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", |
| 278 | + " \n", |
| 279 | + "cp_callback = ModelCheckpoint(\n", |
| 280 | + " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n", |
| 281 | + "\n", |
| 282 | + "# Start training and evaluation\n", |
| 283 | + "history = cls_model.fit(train_snli_xnli_inputs, train_data_labels, epochs=NUM_EPOCHS,\n", |
| 284 | + " validation_data = (dev_xnli_inputs, dev_data_labels),\n", |
| 285 | + " batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])\n", |
| 286 | + "\n", |
| 287 | + "# Training history per epoch\n", |
| 288 | + "print(history.history)" |
| 289 | + ] |
| 290 | + }, |
| 291 | + { |
| 292 | + "cell_type": "code", |
| 293 | + "execution_count": null, |
| 294 | + "metadata": {}, |
| 295 | + "outputs": [], |
| 296 | + "source": [ |
| 297 | + "plot_graphs(history, 'accuracy')\n", |
| 298 | + "plot_graphs(history, 'loss')" |
| 299 | + ] |
| 300 | + }, |
| 301 | + { |
| 302 | + "cell_type": "markdown", |
| 303 | + "metadata": {}, |
| 304 | + "source": [ |
| 305 | + "# KorNLI Test dataset" |
| 306 | + ] |
| 307 | + }, |
| 308 | + { |
| 309 | + "cell_type": "code", |
| 310 | + "execution_count": null, |
| 311 | + "metadata": {}, |
| 312 | + "outputs": [], |
| 313 | + "source": [ |
| 314 | + "# Load Test dataset\n", |
| 315 | + "TEST_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.test.ko.tsv')\n", |
| 316 | + "\n", |
| 317 | + "test_data_xnli = pd.read_csv(TEST_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n", |
| 318 | + "test_data_xnli = test_data_xnli.dropna()\n", |
| 319 | + "test_data_xnli.head()" |
| 320 | + ] |
| 321 | + }, |
| 322 | + { |
| 323 | + "cell_type": "code", |
| 324 | + "execution_count": null, |
| 325 | + "metadata": {}, |
| 326 | + "outputs": [], |
| 327 | + "source": [ |
| 328 | + "# Build the test set inputs in the same way\n", |
| 329 | + "\n", |
| 330 | + "input_ids = []\n", |
| 331 | + "attention_masks = []\n", |
| 332 | + "token_type_ids = []\n", |
| 333 | + "\n", |
| 334 | + "for sent1, sent2 in zip(test_data_xnli['sentence1'], test_data_xnli['sentence2']):\n", |
| 335 | + " \n", |
| 336 | + " try:\n", |
| 337 | + " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n", |
| 338 | + "\n", |
| 339 | + " input_ids.append(input_id)\n", |
| 340 | + " attention_masks.append(attention_mask)\n", |
| 341 | + " token_type_ids.append(token_type_id)\n", |
| 342 | + " except Exception as e:\n", |
| 343 | + " print(e)\n", |
| 344 | + " print(sent1, sent2)\n", |
| 345 | + " pass\n", |
| 346 | + " \n", |
| 347 | + " \n", |
| 348 | + "test_xnli_input_ids = np.array(input_ids, dtype=int)\n", |
| 349 | + "test_xnli_attention_masks = np.array(attention_masks, dtype=int)\n", |
| 350 | + "test_xnli_type_ids = np.array(token_type_ids, dtype=int)\n", |
| 351 | + "test_xnli_inputs = (test_xnli_input_ids, test_xnli_attention_masks, test_xnli_type_ids)" |
| 352 | + ] |
| 353 | + }, |
| 354 | + { |
| 355 | + "cell_type": "code", |
| 356 | + "execution_count": null, |
| 357 | + "metadata": {}, |
| 358 | + "outputs": [], |
| 359 | + "source": [ |
| 360 | + "test_data_xnli[\"gold_label_int\"] = test_data_xnli[\"gold_label\"].apply(convert_int)\n", |
| 361 | + "test_data_xnli_labels = np.array(test_data_xnli['gold_label_int'], dtype=int)\n", |
| 362 | + "\n", |
| 363 | + "print(\"# sents: {}, # labels: {}\".format(len(test_xnli_input_ids), len(test_data_xnli_labels)))" |
| 364 | + ] |
| 365 | + }, |
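|  | + { |
|  | + "cell_type": "markdown", |
|  | + "metadata": {}, |
|  | + "source": [ |
|  | + "Optional (a sketch, assuming the `ModelCheckpoint` callback above wrote at least one checkpoint): reload the best weights before evaluating, since the in-memory model holds the last-epoch weights, which are not necessarily the best ones." |
|  | + ] |
|  | + }, |
|  | + { |
|  | + "cell_type": "code", |
|  | + "execution_count": null, |
|  | + "metadata": {}, |
|  | + "outputs": [], |
|  | + "source": [ |
|  | + "# Restore the best validation-accuracy weights saved during training, if any were saved\n", |
|  | + "if os.path.exists(checkpoint_path):\n", |
|  | + " cls_model.load_weights(checkpoint_path)" |
|  | + ] |
|  | + }, |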
| 366 | + { |
| 367 | + "cell_type": "code", |
| 368 | + "execution_count": null, |
| 369 | + "metadata": {}, |
| 370 | + "outputs": [], |
| 371 | + "source": [ |
| 372 | + "results = cls_model.evaluate(test_xnli_inputs, test_data_xnli_labels, batch_size=512)\n", |
| 373 | + "print(\"test loss, test acc: \", results)" |
| 374 | + ] |
| 375 | + } |
| 376 | + ], |
| 377 | + "metadata": { |
| 378 | + "kernelspec": { |
| 379 | + "display_name": "Python 3", |
| 380 | + "language": "python", |
| 381 | + "name": "python3" |
| 382 | + }, |
| 383 | + "language_info": { |
| 384 | + "codemirror_mode": { |
| 385 | + "name": "ipython", |
| 386 | + "version": 3 |
| 387 | + }, |
| 388 | + "file_extension": ".py", |
| 389 | + "mimetype": "text/x-python", |
| 390 | + "name": "python", |
| 391 | + "nbconvert_exporter": "python", |
| 392 | + "pygments_lexer": "ipython3", |
| 393 | + "version": "3.7.3" |
| 394 | + } |
| 395 | + }, |
| 396 | + "nbformat": 4, |
| 397 | + "nbformat_minor": 2 |
| 398 | +} |