
Commit e916acb

💬 Update file name
1 parent 811af76 commit e916acb

8 files changed: +4346 -0 lines changed

7.PRETRAIN_METHOD/7.2.1.bert_finetune_NSMC.ipynb

Lines changed: 672 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 398 additions & 0 deletions
@@ -0,0 +1,398 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import tensorflow as tf\n",
    "from transformers import BertTokenizer, TFBertModel\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualization helper: plot a training metric together with its validation counterpart\n",
    "\n",
    "def plot_graphs(history, string):\n",
    "    plt.plot(history.history[string])\n",
    "    plt.plot(history.history['val_' + string], '')\n",
    "    plt.xlabel(\"Epochs\")\n",
    "    plt.ylabel(string)\n",
    "    plt.legend([string, 'val_' + string])\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix the random seeds for reproducibility\n",
    "\n",
    "tf.random.set_seed(1234)\n",
    "np.random.seed(1234)\n",
    "\n",
    "# Base hyperparameters\n",
    "\n",
    "BATCH_SIZE = 32\n",
    "NUM_EPOCHS = 3\n",
    "MAX_LEN = 24 * 2  # twice the average sequence length\n",
    "\n",
    "DATA_IN_PATH = './data_in/KOR'\n",
    "DATA_OUT_PATH = \"./data_out/KOR\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KorNLI Dataset\n",
    "\n",
    "Data from Kakaobrain: https://github.com/kakaobrain/KorNLUDatasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the training and dev datasets\n",
    "\n",
    "TRAIN_SNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'snli_1.0_train.kor.tsv')\n",
    "TRAIN_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'multinli.train.ko.tsv')\n",
    "DEV_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.dev.ko.tsv')\n",
    "\n",
    "train_data_snli = pd.read_csv(TRAIN_SNLI_DF, header=0, delimiter='\\t', quoting=3)\n",
    "train_data_xnli = pd.read_csv(TRAIN_XNLI_DF, header=0, delimiter='\\t', quoting=3)\n",
    "dev_data_xnli = pd.read_csv(DEV_XNLI_DF, header=0, delimiter='\\t', quoting=3)\n",
    "\n",
    "# Combine the SNLI and MultiNLI portions into one training set\n",
    "train_data_snli_xnli = train_data_snli.append(train_data_xnli)\n",
    "train_data_snli_xnli = train_data_snli_xnli.dropna()\n",
    "train_data_snli_xnli = train_data_snli_xnli.reset_index()\n",
    "\n",
    "dev_data_xnli = dev_data_xnli.dropna()\n",
    "\n",
    "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data_snli_xnli), len(dev_data_xnli)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# BERT tokenizer\n",
    "\n",
    "# Reference: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus\n",
    "\n",
    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", cache_dir='bert_ckpt', do_lower_case=False)\n",
    "\n",
    "def bert_tokenizer_v2(sent1, sent2, MAX_LEN):\n",
    "\n",
    "    # For two-sentence (sentence-pair) input\n",
    "\n",
    "    encoded_dict = tokenizer.encode_plus(\n",
    "        text=sent1,\n",
    "        text_pair=sent2,\n",
    "        add_special_tokens=True,    # Add '[CLS]' and '[SEP]'\n",
    "        max_length=MAX_LEN,         # Pad & truncate all sentences.\n",
    "        pad_to_max_length=True,\n",
    "        return_attention_mask=True  # Construct attention masks.\n",
    "    )\n",
    "\n",
    "    input_id = encoded_dict['input_ids']\n",
    "    attention_mask = encoded_dict['attention_mask']    # attention mask (differentiates padding from non-padding)\n",
    "    token_type_id = encoded_dict['token_type_ids']     # segment ids (differentiate the two sentences)\n",
    "\n",
    "    return input_id, attention_mask, token_type_id"
   ]
  },
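  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check (illustrative, with a made-up sentence pair): `bert_tokenizer_v2` should return three lists that are each padded or truncated to `MAX_LEN`, and decoding the ids back to tokens makes the `[CLS]`/`[SEP]` placement and the padding visible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical sanity check (the two example sentences are made up, not from KorNLI)\n",
    "sample_id, sample_mask, sample_type = bert_tokenizer_v2(\"남자가 피아노를 치고 있다.\", \"남자가 악기를 연주하고 있다.\", MAX_LEN)\n",
    "\n",
    "# All three outputs should have length MAX_LEN\n",
    "print(len(sample_id), len(sample_mask), len(sample_type))\n",
    "\n",
    "# Decode the ids back to tokens to inspect [CLS]/[SEP] placement and padding\n",
    "print(tokenizer.convert_ids_to_tokens(sample_id))"
   ]
  },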
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize every training premise/hypothesis pair\n",
    "input_ids = []\n",
    "attention_masks = []\n",
    "token_type_ids = []\n",
    "\n",
    "for sent1, sent2 in zip(train_data_snli_xnli['sentence1'], train_data_snli_xnli['sentence2']):\n",
    "    try:\n",
    "        input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n",
    "\n",
    "        input_ids.append(input_id)\n",
    "        attention_masks.append(attention_mask)\n",
    "        token_type_ids.append(token_type_id)\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        print(sent1, sent2)\n",
    "        pass\n",
    "\n",
    "train_snli_xnli_input_ids = np.array(input_ids, dtype=int)\n",
    "train_snli_xnli_attention_masks = np.array(attention_masks, dtype=int)\n",
    "train_snli_xnli_type_ids = np.array(token_type_ids, dtype=int)\n",
    "train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DEV SET Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apart from the tokenizer, this follows the same preprocessing approach as Chapter 5\n",
    "input_ids = []\n",
    "attention_masks = []\n",
    "token_type_ids = []\n",
    "\n",
    "for sent1, sent2 in zip(dev_data_xnli['sentence1'], dev_data_xnli['sentence2']):\n",
    "    try:\n",
    "        input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n",
    "\n",
    "        input_ids.append(input_id)\n",
    "        attention_masks.append(attention_mask)\n",
    "        token_type_ids.append(token_type_id)\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        print(sent1, sent2)\n",
    "        pass\n",
    "\n",
    "dev_xnli_input_ids = np.array(input_ids, dtype=int)\n",
    "dev_xnli_attention_masks = np.array(attention_masks, dtype=int)\n",
    "dev_xnli_type_ids = np.array(token_type_ids, dtype=int)\n",
    "dev_xnli_inputs = (dev_xnli_input_ids, dev_xnli_attention_masks, dev_xnli_type_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Convert the labels from strings (entailment, contradiction, neutral) to integers.\n",
    "label_dict = {\"entailment\": 0, \"contradiction\": 1, \"neutral\": 2}\n",
    "def convert_int(label):\n",
    "    num_label = label_dict[label]\n",
    "    return num_label\n",
    "\n",
    "train_data_snli_xnli[\"gold_label_int\"] = train_data_snli_xnli[\"gold_label\"].apply(convert_int)\n",
    "train_data_labels = np.array(train_data_snli_xnli['gold_label_int'], dtype=int)\n",
    "\n",
    "dev_data_xnli[\"gold_label_int\"] = dev_data_xnli[\"gold_label\"].apply(convert_int)\n",
    "dev_data_labels = np.array(dev_data_xnli['gold_label_int'], dtype=int)\n",
    "\n",
    "print(\"# train labels: {}, # dev labels: {}\".format(len(train_data_labels), len(dev_data_labels)))"
   ]
  },
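  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional check (illustrative): a quick look at the class balance of the integer labels; the SNLI/MultiNLI sources behind KorNLI are roughly balanced across the three classes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count examples per integer label (0: entailment, 1: contradiction, 2: neutral)\n",
    "print(np.unique(train_data_labels, return_counts=True))\n",
    "print(np.unique(dev_data_labels, return_counts=True))"
   ]
  },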
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TFBertClassifier(tf.keras.Model):\n",
    "    def __init__(self, model_name, dir_path, num_class):\n",
    "        super(TFBertClassifier, self).__init__()\n",
    "\n",
    "        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n",
    "        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n",
    "        self.classifier = tf.keras.layers.Dense(num_class,\n",
    "                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),\n",
    "                                                name=\"classifier\")\n",
    "\n",
    "    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n",
    "\n",
    "        # outputs: sequence_output, pooled_output, (hidden_states), (attentions)\n",
    "        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n",
    "        pooled_output = outputs[1]\n",
    "        pooled_output = self.dropout(pooled_output, training=training)\n",
    "        logits = self.classifier(pooled_output)\n",
    "\n",
    "        return logits\n",
    "\n",
    "cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',\n",
    "                             dir_path='bert_ckpt',\n",
    "                             num_class=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare for training: optimizer, loss, and metric\n",
    "optimizer = tf.keras.optimizers.Adam(3e-5)\n",
    "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
    "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n",
    "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run training\n",
    "model_name = \"tf2_KorNLI\"\n",
    "\n",
    "# Add early stopping to guard against overfitting\n",
    "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)\n",
    "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
    "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
    "\n",
    "# Create the checkpoint directory if it does not exist yet\n",
    "if os.path.exists(checkpoint_dir):\n",
    "    print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
    "else:\n",
    "    os.makedirs(checkpoint_dir, exist_ok=True)\n",
    "    print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
    "\n",
    "# Save only the best weights (by validation accuracy)\n",
    "cp_callback = ModelCheckpoint(\n",
    "    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n",
    "\n",
    "# Start training and evaluation\n",
    "history = cls_model.fit(train_snli_xnli_inputs, train_data_labels, epochs=NUM_EPOCHS,\n",
    "                        validation_data=(dev_xnli_inputs, dev_data_labels),\n",
    "                        batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])\n",
    "\n",
    "print(history.history)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_graphs(history, 'accuracy')\n",
    "plot_graphs(history, 'loss')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KorNLI Test dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Test dataset\n",
    "TEST_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.test.ko.tsv')\n",
    "\n",
    "test_data_xnli = pd.read_csv(TEST_XNLI_DF, header=0, delimiter='\\t', quoting=3)\n",
    "test_data_xnli = test_data_xnli.dropna()\n",
    "test_data_xnli.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the test set inputs in the same way\n",
    "\n",
    "input_ids = []\n",
    "attention_masks = []\n",
    "token_type_ids = []\n",
    "\n",
    "for sent1, sent2 in zip(test_data_xnli['sentence1'], test_data_xnli['sentence2']):\n",
    "\n",
    "    try:\n",
    "        input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n",
    "\n",
    "        input_ids.append(input_id)\n",
    "        attention_masks.append(attention_mask)\n",
    "        token_type_ids.append(token_type_id)\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        print(sent1, sent2)\n",
    "        pass\n",
    "\n",
    "test_xnli_input_ids = np.array(input_ids, dtype=int)\n",
    "test_xnli_attention_masks = np.array(attention_masks, dtype=int)\n",
    "test_xnli_type_ids = np.array(token_type_ids, dtype=int)\n",
    "test_xnli_inputs = (test_xnli_input_ids, test_xnli_attention_masks, test_xnli_type_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data_xnli[\"gold_label_int\"] = test_data_xnli[\"gold_label\"].apply(convert_int)\n",
    "test_data_xnli_labels = np.array(test_data_xnli['gold_label_int'], dtype=int)\n",
    "\n",
    "print(\"# sents: {}, # labels: {}\".format(len(test_xnli_input_ids), len(test_data_xnli_labels)))"
   ]
  },
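  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional illustration (hypothetical sentences, not from KorNLI): before the bulk evaluation, a single premise/hypothesis pair can be classified by feeding the model the same `(input_ids, attention_mask, token_type_ids)` tuple format that `fit` received above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical single-pair inference (example sentences are made up)\n",
    "inv_label_dict = {v: k for k, v in label_dict.items()}\n",
    "\n",
    "sample_id, sample_mask, sample_type = bert_tokenizer_v2(\"남자가 피아노를 치고 있다.\", \"남자가 악기를 연주하고 있다.\", MAX_LEN)\n",
    "sample_inputs = (np.array([sample_id]), np.array([sample_mask]), np.array([sample_type]))\n",
    "\n",
    "# predict returns logits of shape (1, 3); argmax picks the most likely class\n",
    "sample_logits = cls_model.predict(sample_inputs)\n",
    "print(inv_label_dict[int(np.argmax(sample_logits, axis=-1)[0])])"
   ]
  },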
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = cls_model.evaluate(test_xnli_inputs, test_data_xnli_labels, batch_size=512)\n",
    "print(\"test loss, test acc: \", results)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
