From eb53175654cdfbc529955fe7a3143e01a4f9bc03 Mon Sep 17 00:00:00 2001
From: sone
Date: Thu, 26 May 2022 12:51:17 +0800
Subject: [PATCH 1/3] [fix] dkt error when batch_size=1

---
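Notes (not part of the diff): the old code initialised `loss` as a plain
`torch.Tensor([0.0])` and only accumulated into it when a student produced a
non-empty prediction. With batch_size=1 a batch can easily contribute nothing,
leaving `loss` without a grad_fn, so the backward pass raised. A minimal
sketch of that failure mode, for illustration only:

    import torch

    loss = torch.Tensor([0.0])  # requires_grad=False, no grad_fn
    # ... no student contributes a non-empty pred, so no BCE term is added ...
    loss.backward()  # RuntimeError: element 0 of tensors does not require grad

The patch instead concatenates every prediction and target over the epoch and
computes a single BCE loss before the backward pass.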
 EduKTM/DKT/DKT.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/EduKTM/DKT/DKT.py b/EduKTM/DKT/DKT.py
index f67cdcb..5f73eba 100644
--- a/EduKTM/DKT/DKT.py
+++ b/EduKTM/DKT/DKT.py
@@ -48,23 +48,22 @@ def train(self, train_data, test_data=None, *, epoch: int, lr=0.002) -> ...:
         optimizer = torch.optim.Adam(self.dkt_model.parameters(), lr)
 
         for e in range(epoch):
-            losses = []
+            all_pred, all_target = torch.Tensor([]), torch.Tensor([])
             for batch in tqdm.tqdm(train_data, "Epoch %s" % e):
                 integrated_pred = self.dkt_model(batch)
                 batch_size = batch.shape[0]
-                loss = torch.Tensor([0.0])
                 for student in range(batch_size):
                     pred, truth = process_raw_pred(batch[student], integrated_pred[student], self.num_questions)
-                    if pred.shape[0] != 0:
-                        loss += loss_function(pred, truth.float())
+                    all_pred = torch.cat([all_pred, pred])
+                    all_target = torch.cat([all_target, truth.float()])
 
-                # back propagation
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
+            loss = loss_function(all_pred, all_target)
+            # back propagation
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
 
-                losses.append(loss.mean().item())
-            print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses))))
+            print("[Epoch %d] LogisticLoss: %.6f" % (e, loss))
 
             if test_data is not None:
                 auc = self.eval(test_data)

From 39340c3baf8d6e22cfdebe9af7469f36fc662d63 Mon Sep 17 00:00:00 2001
From: sone
Date: Thu, 26 May 2022 12:53:19 +0800
Subject: [PATCH 2/3] [fix] akt name

---
 EduKTM/AKT/AKT.py      | 6 +++---
 examples/AKT/AKT.ipynb | 2 +-
 tests/akt/test_akt.py  | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/EduKTM/AKT/AKT.py b/EduKTM/AKT/AKT.py
index c6297ec..24f36bf 100644
--- a/EduKTM/AKT/AKT.py
+++ b/EduKTM/AKT/AKT.py
@@ -188,12 +188,12 @@ def train(self, train_data, test_data=None, *, epoch: int, lr=0.002) -> ...:
         optimizer = torch.optim.Adam(self.akt_net.parameters(), lr=lr, betas=(0.0, 0.999), eps=1e-8)
 
         for idx in range(epoch):
-            train_loss, train_accuracy, train_acc = train_one_epoch(self.akt_net, self.params, optimizer, *train_data)
+            train_loss, train_auc, train_accuracy = train_one_epoch(self.akt_net, self.params, optimizer, *train_data)
             print("[Epoch %d] LogisticLoss: %.6f" % (idx, train_loss))
 
             if test_data is not None:
-                valid_loss, valid_accuracy, valid_acc = self.eval(test_data)
-                print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (idx, valid_acc, valid_accuracy))
+                valid_loss, valid_auc, valid_accuracy = self.eval(test_data)
+                print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (idx, valid_auc, valid_accuracy))
 
     def eval(self, test_data) -> ...:
         self.akt_net.eval()
diff --git a/examples/AKT/AKT.ipynb b/examples/AKT/AKT.ipynb
index dacf49f..280f0b9 100644
--- a/examples/AKT/AKT.ipynb
+++ b/examples/AKT/AKT.ipynb
@@ -9,7 +9,7 @@
    }
   },
   "source": [
-    "# Deep Knowledge Tracing Plus (DKT+)\n",
+    "# Attentive Knowledge Tracing (AKT)\n",
    "\n",
    "This notebook will show you how to train and use the AKT.\n",
    "First, we will show how to get the data (here we use assistment-2009-2010-skill as the dataset).\n",
diff --git a/tests/akt/test_akt.py b/tests/akt/test_akt.py
index b96539a..ed4d081 100644
--- a/tests/akt/test_akt.py
+++ b/tests/akt/test_akt.py
@@ -17,6 +17,6 @@ def test_train(data, conf, tmp_path, maxgradnorm, separate_qa, kq_same):
     akt = AKT(n_question, n_pid, n_blocks, d_model, dropout, kq_same,
               l2, batch_size, maxgradnorm, separate_qa)
     akt.train(data, test_data=data, epoch=2)
-    filepath = tmp_path / "dkt+.params"
+    filepath = tmp_path / "akt.params"
     akt.save(filepath)
     akt.load(filepath)

From d3c9718b8059f3e78d95fc51c3747188a026819c Mon Sep 17 00:00:00 2001
From: sone
Date: Tue, 27 Sep 2022 18:06:02 +0800
Subject: [PATCH 3/3] [feat] add OKT

---
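Notes (not part of the diff): OKT ("Offline-aware Knowledge Tracing", per
docs/OKT.md) keeps two recurrent states per student: a knowledge state h and
an auxiliary state v, with an interval-time embedding it gating the updates.
The three cells added in EduKTM/OKT/modules.py share one gated-update shape;
sketched here for review, as implemented:

    UKSE: h <- (1 - p) * h + p * tanh(W_h [h; v; it]),      p = sigmoid(W_p [h; v; it])
    KSE:  h <- (1 - p) * h + p * tanh(W_h [x; h]),          p = sigmoid(W_p [x; h])
    OTE:  v <- (1 - p) * v + p * tanh(W_v [h_prev; h; it]), p = sigmoid(W_p [v; it])

where x encodes the current exercise and response. At each step OKTNet applies
UKSE before predicting, then KSE and OTE once the response is observed (see
OKTNet.forward).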
 EduKTM/OKT/OKT.py                  | 193 ++++++++++++++++++++++++
 EduKTM/OKT/OKTNet.py               |  89 +++++++++++
 EduKTM/OKT/__init__.py             |   4 +
 EduKTM/OKT/modules.py              |  49 ++++++
 EduKTM/__init__.py                 |   1 +
 README.md                          |   1 +
 docs/OKT.md                        |  20 +++
 examples/OKT/OKT.ipynb             | 233 +++++++++++++++++++++++++++
 examples/OKT/OKT.py                | 108 +++++++++++++
 examples/OKT/load_data.py          | 125 ++++++++++++++++
 examples/OKT/prepare_dataset.ipynb | 222 +++++++++++++++++++++
 tests/okt/__init__.py              |   2 +
 tests/okt/conftest.py              |  52 +++++++
 tests/okt/test_okt.py              |  21 +++
 14 files changed, 1120 insertions(+)
 create mode 100644 EduKTM/OKT/OKT.py
 create mode 100644 EduKTM/OKT/OKTNet.py
 create mode 100644 EduKTM/OKT/__init__.py
 create mode 100644 EduKTM/OKT/modules.py
 create mode 100644 docs/OKT.md
 create mode 100644 examples/OKT/OKT.ipynb
 create mode 100644 examples/OKT/OKT.py
 create mode 100644 examples/OKT/load_data.py
 create mode 100644 examples/OKT/prepare_dataset.ipynb
 create mode 100644 tests/okt/__init__.py
 create mode 100644 tests/okt/conftest.py
 create mode 100644 tests/okt/test_okt.py

diff --git a/EduKTM/OKT/OKT.py b/EduKTM/OKT/OKT.py
new file mode 100644
index 0000000..2669a02
--- /dev/null
+++ b/EduKTM/OKT/OKT.py
@@ -0,0 +1,193 @@
+import math
+import logging
+import torch
+import torch.nn as nn
+import numpy as np
+import tqdm
+from sklearn import metrics
+from scipy.stats import pearsonr
+
+from EduKTM import KTM
+from .OKTNet import OKTNet
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def binary_entropy(target, pred):
+    loss = target * np.log(np.maximum(1e-10, pred)) + (1.0 - target) * np.log(np.maximum(1e-10, 1.0 - pred))
+    return np.average(loss) * -1.0
+
+
+def compute_auc(all_target, all_pred):
+    return metrics.roc_auc_score(all_target, all_pred)
+
+
+def compute_accuracy(all_target, all_pred):
+    all_pred = all_pred.copy()
+    all_pred[all_pred > 0.5] = 1.0
+    all_pred[all_pred <= 0.5] = 0.0
+    return metrics.accuracy_score(all_target, all_pred)
+
+
+def compute_rmse(all_target, all_pred):
+    return np.sqrt(metrics.mean_squared_error(all_target, all_pred))
+
+
+def compute_r2(all_target, all_pred):
+    return np.power(pearsonr(all_target, all_pred)[0], 2)
+
+
+def train_one_epoch(net, optimizer, criterion, batch_size, q_data, a_data, e_data, it_data, at_data=None):
+    net.train()
+    n = int(math.ceil(len(e_data) / batch_size))
+    shuffled_ind = np.arange(e_data.shape[0])
+    np.random.shuffle(shuffled_ind)
+    q_data = q_data[shuffled_ind]
+    e_data = e_data[shuffled_ind]
+    if at_data is not None:
+        at_data = at_data[shuffled_ind]
+    a_data = a_data[shuffled_ind]
+    it_data = it_data[shuffled_ind]
+
+    pred_list = []
+    target_list = []
+    for idx in tqdm.tqdm(range(n), 'Training'):
+        optimizer.zero_grad()
+
+        q_one_seq = q_data[idx * batch_size: (idx + 1) * batch_size, :]
+        e_one_seq = e_data[idx * batch_size: (idx + 1) * batch_size, :]
+        a_one_seq = a_data[idx * batch_size: (idx + 1) * batch_size, :]
+        it_one_seq = it_data[idx * batch_size: (idx + 1) * batch_size, :]
+
+        input_q = torch.from_numpy(q_one_seq).long().to(device)
+        input_e = torch.from_numpy(e_one_seq).long().to(device)
+        input_a = torch.from_numpy(a_one_seq).long().to(device)
+        input_it = torch.from_numpy(it_one_seq).long().to(device)
+        target = torch.from_numpy(a_one_seq).float().to(device)
+
+        input_at = None
+        if at_data is not None:
+            at_one_seq = at_data[idx * batch_size: (idx + 1) * batch_size, :]
+            input_at = torch.from_numpy(at_one_seq).long().to(device)
+
+        pred = net(input_q, input_a, input_e, input_it, input_at)
+
+        mask = input_e[:, 1:] > 0
+        masked_pred = pred[:, 1:][mask]
+        masked_truth = target[:, 1:][mask]
+
+        loss = criterion(masked_pred, masked_truth)
+
+        loss.backward()
+
+        nn.utils.clip_grad_norm_(net.parameters(), max_norm=10)
+        optimizer.step()
+
+        masked_pred = masked_pred.detach().cpu().numpy()
+        masked_truth = masked_truth.detach().cpu().numpy()
+        pred_list.append(masked_pred)
+        target_list.append(masked_truth)
+
+    all_pred = np.concatenate(pred_list, axis=0)
+    all_target = np.concatenate(target_list, axis=0)
+
+    loss = binary_entropy(all_target, all_pred)
+    r2 = compute_r2(all_target, all_pred)
+    auc = compute_auc(all_target, all_pred)
+    accuracy = compute_accuracy(all_target, all_pred)
+
+    return loss, r2, auc, accuracy
+
+
+def test_one_epoch(net, batch_size, q_data, a_data, e_data, it_data, at_data=None):
+    net.eval()
+    n = int(math.ceil(len(e_data) / batch_size))
+
+    pred_list = []
+    target_list = []
+    mask_list = []
+
+    for idx in tqdm.tqdm(range(n), 'Testing'):
+        q_one_seq = q_data[idx * batch_size: (idx + 1) * batch_size, :]
+        e_one_seq = e_data[idx * batch_size: (idx + 1) * batch_size, :]
+        a_one_seq = a_data[idx * batch_size: (idx + 1) * batch_size, :]
+        it_one_seq = it_data[idx * batch_size: (idx + 1) * batch_size, :]
+
+        input_q = torch.from_numpy(q_one_seq).long().to(device)
+        input_e = torch.from_numpy(e_one_seq).long().to(device)
+        input_a = torch.from_numpy(a_one_seq).long().to(device)
+        input_it = torch.from_numpy(it_one_seq).long().to(device)
+        target = torch.from_numpy(a_one_seq).float().to(device)
+
+        input_at = None
+        if at_data is not None:
+            at_one_seq = at_data[idx * batch_size: (idx + 1) * batch_size, :]
+            input_at = torch.from_numpy(at_one_seq).long().to(device)
+
+        with torch.no_grad():
+            pred = net(input_q, input_a, input_e, input_it, input_at)
+
+        mask = input_e[:, 1:] > 0
+        masked_pred = pred[:, 1:][mask].detach().cpu().numpy()
+        masked_truth = target[:, 1:][mask].detach().cpu().numpy()
+
+        pred_list.append(masked_pred)
+        target_list.append(masked_truth)
+        mask_list.append(mask.long().cpu().numpy())
+
+    all_pred = np.concatenate(pred_list, axis=0)
+    all_target = np.concatenate(target_list, axis=0)
+    mask_list = np.concatenate(mask_list, axis=0)
+
+    loss = binary_entropy(all_target, all_pred)
+    r2 = compute_r2(all_target, all_pred)
+    auc = compute_auc(all_target, all_pred)
+    accuracy = compute_accuracy(all_target, all_pred)
+    rmse = compute_rmse(all_target, all_pred)
+
+    return loss, rmse, r2, auc, accuracy
+
+
+class OKT(KTM):
+    def __init__(self, n_at, n_it, n_exercise, n_question, d_e, d_q, d_a, d_at, d_p, d_h, batch_size=64, dropout=0.2):
+        super(OKT, self).__init__()
+
+        self.okt_net = OKTNet(n_question, n_exercise, n_it, n_at, d_e, d_q, d_a, d_at, d_p, d_h,
+                              dropout=dropout).to(device)
+        self.batch_size = batch_size
+
+    def train(self, train_data, test_data=None, *, epoch: int, lr=0.002, lr_decay_step=15, lr_decay_rate=0.5,
+              filepath=None) -> ...:
+        optimizer = torch.optim.Adam(self.okt_net.parameters(), lr=lr, weight_decay=1e-5)
+        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, lr_decay_step, gamma=lr_decay_rate)
+        criterion = nn.BCELoss()
+        best_train_auc, best_test_auc = .0, .0
+
+        for idx in range(epoch):
+            train_loss, train_r2, train_auc, train_accuracy = train_one_epoch(self.okt_net, optimizer, criterion,
+                                                                              self.batch_size, *train_data)
+            print("[Epoch %d] LogisticLoss: %.6f" % (idx, train_loss))
+            if train_auc > best_train_auc:
+                best_train_auc = train_auc
+
+            if test_data is not None:
+                _, _, test_r2, test_auc, test_accuracy = self.eval(test_data)
+                print("[Epoch %d] r2: %.6f, auc: %.6f, accuracy: %.6f" % (idx, test_r2, test_auc, test_accuracy))
+                scheduler.step()
+                if test_auc > best_test_auc:
+                    best_test_auc = test_auc
+                    if filepath is not None:
+                        self.save(filepath)
+
+        return best_train_auc, best_test_auc
+
+    def eval(self, test_data) -> ...:
+        return test_one_epoch(self.okt_net, self.batch_size, *test_data)
+
+    def save(self, filepath) -> ...:
+        torch.save(self.okt_net.state_dict(), filepath)
+        logging.info("save parameters to %s" % filepath)
+
+    def load(self, filepath) -> ...:
+        self.okt_net.load_state_dict(torch.load(filepath, map_location='cpu'))
+        logging.info("load parameters from %s" % filepath)
diff --git a/EduKTM/OKT/OKTNet.py b/EduKTM/OKT/OKTNet.py
new file mode 100644
index 0000000..c48cbaf
--- /dev/null
+++ b/EduKTM/OKT/OKTNet.py
@@ -0,0 +1,89 @@
+import torch
+import torch.nn as nn
+from torch.nn.init import xavier_uniform_
+
+from .modules import UKSE, KSE, OTE
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+class OKTNet(nn.Module):
+    def __init__(self, n_skill, n_exercise, n_it, n_at, d_e, d_q, d_a, d_at, d_p, d_h, dropout=0.05):
+        super(OKTNet, self).__init__()
+        self.device = device
+
+        self.n_skill = n_skill
+        self.n_at = n_at
+        self.d_h = d_h
+        self.d_a = d_a
+
+        d_it = d_h
+        self.it_embed = nn.Embedding(n_it + 1, d_it)
+        xavier_uniform_(self.it_embed.weight)
+        self.at_embed = nn.Embedding(n_at + 1, d_at)
+        xavier_uniform_(self.at_embed.weight)
+        self.answer_embed = nn.Embedding(2, d_a)
+        xavier_uniform_(self.answer_embed.weight)
+        self.exercise_embed = nn.Embedding(n_exercise + 1, d_e)
+        xavier_uniform_(self.exercise_embed.weight)
+        self.skill_embed = nn.Embedding(n_skill + 1, d_q)
+        xavier_uniform_(self.skill_embed.weight)
+
+        self.linear_q = nn.Linear(d_e + d_q, d_p)
+        xavier_uniform_(self.linear_q.weight)
+        if n_at == 0:
+            self.linear_x = nn.Linear(d_p + d_a, d_h)
+        else:
+            self.linear_x = nn.Linear(d_at + d_p + d_a, d_h)
+        xavier_uniform_(self.linear_x.weight)
+        self.ukse = UKSE(d_h)
+        self.kse = KSE(d_h)
+        self.ote = OTE(d_h, d_it, d_h)
+
+        self.sig = nn.Sigmoid()
+        self.tanh = nn.Tanh()
+
+        self.dropout = nn.Dropout(dropout)
+
+        self.predict = nn.Sequential(
+            nn.Linear(d_h + d_p, 256),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(256, 1)
+        )
+
+    def forward(self, kc_data, a_data, e_data, it_data, at_data):
+        # prepare data
+        batch_size, seq_len = kc_data.size(0), kc_data.size(1)
+
+        E = self.exercise_embed(e_data)
+        KC = self.skill_embed(kc_data)
+        IT = self.it_embed(it_data)
+        Ans = self.answer_embed(a_data)
+        Q = self.linear_q(torch.cat((E, KC), 2))
+        if self.n_at == 0:
+            X = self.linear_x(torch.cat((Q, Ans), 2))
+        else:
+            AT = self.at_embed(at_data)
+            X = self.linear_x(torch.cat((Q, Ans, AT), 2))
+
+        previous_h = xavier_uniform_(torch.zeros(1, self.d_h)).repeat(batch_size, 1).to(self.device)
+        v = xavier_uniform_(torch.empty(1, self.d_h)).repeat(batch_size, 1).to(self.device)
+        pred = torch.zeros(batch_size, seq_len, 1).to(self.device)
+
+        for t in range(seq_len):
+            it_embed = IT[:, t]
+            q = Q[:, t]
+            x = X[:, t]
+
+            # predict
+            updated_h = self.ukse(previous_h, v, it_embed)
+            pred[:, t] = self.sig(self.predict(torch.cat((updated_h, q), 1)))
+
+            # update
+            h = self.kse(x, updated_h)
+            v = self.ote(previous_h, h, it_embed, v)
+
+            previous_h = h
+
+        return pred.squeeze(-1)
diff --git a/EduKTM/OKT/__init__.py b/EduKTM/OKT/__init__.py
new file mode 100644
index 0000000..db09923
--- /dev/null
+++ b/EduKTM/OKT/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+# 2022/9/27 @ sone
+
+from .OKT import OKT
diff --git a/EduKTM/OKT/modules.py b/EduKTM/OKT/modules.py
new file mode 100644
index 0000000..2042161
--- /dev/null
+++ b/EduKTM/OKT/modules.py
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+from torch.nn.init import xavier_uniform_
+
+
+class OTE(nn.Module):
+    def __init__(self, d_h, d_it, d_v):
+        super(OTE, self).__init__()
+        self.linear_v = nn.Linear(2 * d_h + d_it, d_v)
+        self.linear_p = nn.Linear(d_it + d_v, d_v)
+        self.sig = nn.Sigmoid()
+        self.tanh = nn.Tanh()
+
+    def forward(self, previous_h, h, it, v):
+        delta = torch.cat((previous_h, h), 1)
+        v_prime = self.tanh(self.linear_v(torch.cat((delta, it), 1)))
+        p = self.sig(self.linear_p(torch.cat((v, it), 1)))
+        v = (1 - p) * v + p * v_prime
+        return v
+
+
+class UKSE(nn.Module):
+    def __init__(self, d_h):
+        super(UKSE, self).__init__()
+        self.linear_h = nn.Linear(3 * d_h, d_h)
+        self.linear_p = nn.Linear(3 * d_h, d_h)
+        self.sig = nn.Sigmoid()
+        self.tanh = nn.Tanh()
+
+    def forward(self, h, v, it):
+        h_prime = self.tanh(self.linear_h(torch.cat((h, v, it), 1)))
+        p = self.sig(self.linear_p(torch.cat((h, v, it), 1)))
+        return (1 - p) * h + p * h_prime
+
+
+class KSE(nn.Module):
+    def __init__(self, d_h):
+        super(KSE, self).__init__()
+        self.linear_h = nn.Linear(2 * d_h, d_h)
+        self.linear_p = nn.Linear(2 * d_h, d_h)
+
+        self.sig = nn.Sigmoid()
+        self.tanh = nn.Tanh()
+
+    def forward(self, x, hr):
+        h_tilde = self.tanh(self.linear_h(torch.cat((x, hr), 1)))
+        p = self.sig(self.linear_p(torch.cat((x, hr), 1)))
+        hx = (1 - p) * hr + p * h_tilde
+        return hx
diff --git a/EduKTM/__init__.py b/EduKTM/__init__.py
index 6dc7d36..d968c34 100644
--- a/EduKTM/__init__.py
+++ b/EduKTM/__init__.py
@@ -10,3 +10,4 @@
 from .LPKT import LPKT
 from .GKT import GKT
 from .DKVMN import DKVMN
+from .OKT import OKT
diff --git a/README.md b/README.md
index 67167dc..39cc62f 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ Knowledge Tracing (KT), which aims to monitor students’ evolving knowledge sta
 * [GKT](EduKTM/GKT)[[doc]](docs/GKT.md) [[example]](examples/GKT)
 * [AKT](EduKTM/AKT) [[doc]](docs/AKT.md) [[example]](examples/AKT)
 * [LPKT](EduKTM/LPKT) [[doc]](docs/LPKT.md) [[example]](examples/LPKT)
+* [OKT](EduKTM/OKT) [[doc]](docs/OKT.md) [[example]](examples/OKT)
 
 ## Contribute
 
diff --git a/docs/OKT.md b/docs/OKT.md
new file mode 100644
index 0000000..f1e0316
--- /dev/null
+++ b/docs/OKT.md
@@ -0,0 +1,20 @@
+# Offline-aware Knowledge Tracing (OKT)
+
+The details of OKT will be given after the paper is published.
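+
+Until then, here is a minimal usage sketch based on the interfaces added in
+this version. The assist2017 sizes below are copied from the example notebook
+and are illustrative rather than tuned:
+
+```python
+from EduKTM import OKT
+
+# constructor order: n_at, n_it, n_exercise, n_question, then embedding sizes
+okt = OKT(n_at=1326, n_it=2873, n_exercise=3162, n_question=102,
+          d_e=128, d_q=32, d_a=128, d_at=50, d_p=128, d_h=128,
+          batch_size=64, dropout=0.3)
+# train_data / test_data are the tuples produced by examples/OKT/load_data.py
+okt.train(train_data, test_data, epoch=2, filepath='okt.params')
+okt.load('okt.params')
+loss, rmse, r2, auc, accuracy = okt.eval(test_data)
+```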
diff --git a/examples/OKT/OKT.ipynb b/examples/OKT/OKT.ipynb
new file mode 100644
index 0000000..feba37e
--- /dev/null
+++ b/examples/OKT/OKT.ipynb
@@ -0,0 +1,233 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Offline-aware Knowledge Tracing (OKT)\n",
+    "\n",
+    "This notebook will show you how to train and use the OKT.\n",
+    "First, we will show how to get the data (here we use assistment-2017 as the dataset).\n",
+    "Then we will show how to train an OKT and perform the parameters persistence.\n",
+    "At last, we will show how to load the parameters from the file and evaluate on the test dataset.\n",
+    "\n",
+    "The script version could be found in [OKT.py](OKT.py)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Data Preparation\n",
+    "\n",
+    "Before we process the data, we need to first acquire the dataset which is shown in [prepare_dataset.ipynb](prepare_dataset.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from load_data import DATA\n",
+    "\n",
+    "dataset_name = 'assist2017'\n",
+    "\n",
+    "# dataset detail\n",
+    "n_at = 1326\n",
+    "n_it = 2873\n",
+    "n_exercise = 3162\n",
+    "n_question = 102\n",
+    "seq_len = 500\n",
+    "dataset_path = 'anonymized_full_release_competition_dataset'\n",
+    "has_at = True\n",
+    "\n",
+    "# hyperparameters\n",
+    "d_q, d_e = 32, 128\n",
+    "d_p, d_a = 128, 128\n",
+    "d_at = 50\n",
+    "d_h = 128\n",
+    "\n",
+    "# train detail\n",
+    "dropout = 0.3\n",
+    "batch_size = 64\n",
+    "lr = 3e-3\n",
+    "lr_decay_step = 10\n",
+    "lr_decay_rate = 0.5\n",
+    "epoch = 2\n",
+    "\n",
+    "data_path = './data/' + dataset_path\n",
+    "dat = DATA(seqlen=seq_len, separate_char=',', has_at=has_at)\n",
+    "train_data = dat.load_data(data_path + '/train0.txt')\n",
+    "valid_data = dat.load_data(data_path + '/valid0.txt')\n",
+    "test_data = dat.load_data(data_path + '/test.txt')\n",
+    "model_file_path = 'okt-' + dataset_name + '.params'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training and Persistence"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "logging.getLogger().setLevel(logging.INFO)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Training: 100%|██████████| 28/28 [07:09<00:00, 15.32s/it]\n",
+      "Testing:   0%|          | 0/7 [00:00 self.seqlen:
+                    n_split = math.floor(len(A) / self.seqlen)
+                    if total_len % self.seqlen:
+                        n_split = n_split + 1
+
+                for k in range(n_split):
+                    question_sequence = []
+                    answer_sequence = []
+                    exercise_sequence = []
+                    it_sequence = []
+                    at_sequence = []
+                    if k == n_split - 1:
+                        end_index = total_len
+                    else:
+                        end_index = (k + 1) * self.seqlen
+                    # choose the sequence length is larger than 2
+                    if end_index - k * self.seqlen > 2:
+                        for i in range(k * self.seqlen, end_index):
+                            question_sequence.append(int(Q[i]))
+                            answer_sequence.append(int(A[i]))
+                            exercise_sequence.append(int(E[i]))
+                            it_sequence.append(int(IT[i]))
+                            if self.has_at:
+                                at_sequence.append(int(AT[i]))
+
+                        q_data.append(question_sequence)
+                        a_data.append(answer_sequence)
+                        e_data.append(exercise_sequence)
+                        it_data.append(it_sequence)
+                        at_data.append(at_sequence)
+        f_data.close()
+        # data: [[],[],[],...] <-- set_max_seqlen is used
+        # convert data into ndarrays for better speed during training
+        q_dataArray = np.zeros((len(q_data), self.seqlen))
+        for j in range(len(q_data)):
+            dat = q_data[j]
+            q_dataArray[j, :len(dat)] = dat
+
+        a_dataArray = np.zeros((len(a_data), self.seqlen))
+        for j in range(len(a_data)):
+            dat = a_data[j]
+            a_dataArray[j, :len(dat)] = dat
+
+        e_dataArray = np.zeros((len(e_data), self.seqlen))
+        for j in range(len(e_data)):
+            dat = e_data[j]
+            e_dataArray[j, :len(dat)] = dat
+
+        it_dataArray = np.zeros((len(it_data), self.seqlen))
+        for j in range(len(it_data)):
+            dat = it_data[j]
+            it_dataArray[j, :len(dat)] = dat
+
+        at_dataArray = np.zeros((len(at_data), self.seqlen))
+        for j in range(len(at_data)):
+            dat = at_data[j]
+            at_dataArray[j, :len(dat)] = dat
+
+        selection = {
+            'q': q_dataArray,
+            'a': a_dataArray,
+            'e': e_dataArray,
+            'it': it_dataArray,
+            'at': at_dataArray,
+        }
+        res = []
+        if self.selection_keys is None:
+            res = selection.values()
+        else:
+            for k in self.selection_keys:
+                res.append(selection[k])
+        return tuple(res)
diff --git a/examples/OKT/prepare_dataset.ipynb b/examples/OKT/prepare_dataset.ipynb
new file mode 100644
index 0000000..e2903a1
--- /dev/null
+++ b/examples/OKT/prepare_dataset.ipynb
@@ -0,0 +1,222 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split, KFold\n",
+    "from EduData import get_data\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import tqdm\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = './data/anonymized_full_release_competition_dataset'\n",
+    "\n",
+    "if not os.path.exists(path + '/anonymized_full_release_competition_dataset.csv'):\n",
+    "    get_data(\"assistment-2017\", \"./data\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(\n",
+    "    path + '/anonymized_full_release_competition_dataset.csv',\n",
+    "    usecols=['startTime', 'endTime', 'timeTaken', 'studentId', 'skill', 'problemId', 'correct']\n",
+    ").dropna(subset=['skill', 'problemId'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data.timeTaken = data.timeTaken.astype(int)\n",
+    "\n",
+    "skills = data.skill.unique().tolist()\n",
+    "problems = data.problemId.unique().tolist()\n",
+    "at = data.timeTaken.unique()\n",
+    "user_seqs = [u.sort_values('endTime') for _, u in list(data.groupby('studentId'))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# question id from 1 to #num_skill\n",
+    "skill2id = {p: i + 1 for i, p in enumerate(skills)}\n",
+    "problem2id = {p: i + 1 for i, p in enumerate(problems)}\n",
+    "at2id = {a: i for i, a in enumerate(at)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "it = set()\n",
+    "avg_it = np.array([])\n",
+    "# calculate interval time\n",
+    "for i, seq in enumerate(user_seqs):\n",
+    "    seq = seq.copy()\n",
+    "    items = seq.endTime.diff(1) // 60\n",
+    "    items.iloc[0] = 0\n",
+    "    items = items.astype(int)\n",
+    "    items[items > 43200] = 43200\n",
+    "    seq['it'] = items\n",
+    "    user_seqs[i] = seq\n",
+    "    for item in items.unique():\n",
+    "        it.add(item)\n",
+    "\n",
+    "it2id = {a: i for i, a in enumerate(it)}"
+   ]
+  },
+  {
+   "cell_type": "code",
"execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "parse student sequence:\t: 100%|██████████| 1709/1709 [00:01<00:00, 1439.26it/s]\n" + ] + } + ], + "source": [ + "def parse_all_seq(students):\n", + " all_sequences = []\n", + " for seq in tqdm.tqdm(students, 'parse student sequence:\\t'):\n", + " student_sequence = parse_student_seq(seq)\n", + " all_sequences.extend([student_sequence])\n", + " return all_sequences\n", + "\n", + "\n", + "def parse_student_seq(student):\n", + " seq = student\n", + " s = [skill2id[q] for q in seq.skill.tolist()]\n", + " a = seq.correct.tolist()\n", + " p = [problem2id[p] for p in seq.problemId.tolist()]\n", + " it = [it2id[int(x)] for x in seq.it.tolist()]\n", + " at = [at2id[int(x)] for x in seq.timeTaken.tolist()]\n", + " return s, a, p, it, at\n", + "\n", + "sequences = parse_all_seq(user_seqs)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "parse student sequence:\t: 100%|██████████| 1709/1709 [00:00<00:00, 2218.42it/s]\n" + ] + } + ], + "source": [ + "sequences = parse_all_seq(user_seqs)\n", + "\n", + "# split train data and test data\n", + "train_data, test_data = train_test_split(sequences, test_size=.2, random_state=5)\n", + "train_data = np.array(train_data, dtype=object)\n", + "test_data = np.array(test_data, dtype=object)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "write data into file ./data/anonymized_full_release_competition_dataset/train0.txt: 100%|██████████| 1093/1093 [00:00<00:00, 1824.67it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/valid0.txt: 100%|██████████| 274/274 [00:00<00:00, 2092.50it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/train1.txt: 100%|██████████| 1093/1093 [00:00<00:00, 2009.51it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/valid1.txt: 100%|██████████| 274/274 [00:00<00:00, 2070.91it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/train2.txt: 100%|██████████| 1094/1094 [00:00<00:00, 2044.91it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/valid2.txt: 100%|██████████| 273/273 [00:00<00:00, 1515.39it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/train3.txt: 100%|██████████| 1094/1094 [00:00<00:00, 1903.32it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/valid3.txt: 100%|██████████| 273/273 [00:00<00:00, 1799.80it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/train4.txt: 100%|██████████| 1094/1094 [00:00<00:00, 1738.26it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/valid4.txt: 100%|██████████| 273/273 [00:00<00:00, 1942.51it/s]\n", + "write data into file ./data/anonymized_full_release_competition_dataset/test.txt: 100%|██████████| 342/342 [00:00<00:00, 1950.58it/s]\n" + ] + } + ], + "source": [ + "def sequences2l(sequences, trg_path):\n", + " with open(trg_path, 'a', encoding='utf8') as f:\n", + " for seq in tqdm.tqdm(sequences, 'write data into file %s' % trg_path):\n", + " s_seq, a_seq, p_seq, it_seq, at_seq = seq\n", + " seq_len = len(s_seq)\n", + " f.write(str(seq_len) + '\\n')\n", + " 
+    "            f.write(','.join([str(s) for s in s_seq]) + '\\n')\n",
+    "            f.write(','.join([str(a) for a in a_seq]) + '\\n')\n",
+    "            f.write(','.join([str(p) for p in p_seq]) + '\\n')\n",
+    "            f.write(','.join([str(i) for i in it_seq]) + '\\n')\n",
+    "            f.write(','.join([str(a) for a in at_seq]) + '\\n')\n",
+    "\n",
+    "\n",
+    "# split into 5 folds\n",
+    "kfold = KFold(n_splits=5, shuffle=True, random_state=5)\n",
+    "idx = 0\n",
+    "for train_idx, valid_idx in kfold.split(train_data):\n",
+    "    sequences2l(train_data[train_idx], path + '/train' + str(idx) + '.txt')\n",
+    "    sequences2l(train_data[valid_idx], path + '/valid' + str(idx) + '.txt')\n",
+    "    idx += 1\n",
+    "\n",
+    "sequences2l(test_data, path + '/test.txt')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.3 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/okt/__init__.py b/tests/okt/__init__.py
new file mode 100644
index 0000000..a199655
--- /dev/null
+++ b/tests/okt/__init__.py
@@ -0,0 +1,2 @@
+# coding: utf-8
+# 2022/9/27 @ zengxiaonan
diff --git a/tests/okt/conftest.py b/tests/okt/conftest.py
new file mode 100644
index 0000000..3a5ec4b
--- /dev/null
+++ b/tests/okt/conftest.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+# 2022/9/27 @ zengxiaonan
+import random
+import numpy as np
+import pytest
+
+
+@pytest.fixture(scope="package", params=[0, 32])
+def conf(request):
+    batch_size = 16
+    n_at = request.param
+    n_it = 32
+    n_skill = 8
+    n_exercise = 32
+
+    return n_at, n_it, n_skill, n_exercise, batch_size
+
+
+@pytest.fixture(scope="package")
+def data(conf):
+    n_at, n_it, n_skill, n_exercise, batch_size = conf
+    seqlen = 10
+
+    s = [
+        [random.randint(1, n_skill) for _ in range(seqlen)]
+        for _ in range(batch_size)
+    ]
+    a = [
+        [random.randint(0, 1) for _ in range(seqlen)]
+        for _ in range(batch_size)
+    ]
+    e = [
+        [random.randint(1, n_exercise) for _ in range(seqlen)]
+        for _ in range(batch_size)
+    ]
+    it = [
+        [random.randint(1, n_it) for _ in range(seqlen)]
+        for _ in range(batch_size)
+    ]
+    at = None
+    if n_at != 0:
+        at = [
+            [random.randint(1, n_at) for _ in range(seqlen)]
+            for _ in range(batch_size)
+        ]
+
+    if n_at != 0:
+        data = (np.array(s), np.array(a), np.array(e), np.array(it), np.array(at))
+    else:
+        data = (np.array(s), np.array(a), np.array(e), np.array(it))
+
+    return data
diff --git a/tests/okt/test_okt.py b/tests/okt/test_okt.py
new file mode 100644
index 0000000..86fd8a5
--- /dev/null
+++ b/tests/okt/test_okt.py
@@ -0,0 +1,21 @@
+# coding: utf-8
+# 2022/9/27 @ zengxiaonan
+
+from EduKTM import OKT
+
+
+def test_train(data, conf, tmp_path):
+    n_at, n_it, n_question, n_exercise, batch_size = conf
+    d_e = 32
+    d_q = 16
+    d_a = 16
+    d_at = 10
+    d_p = 16
+    d_h = 32
+    dropout = 0.2
+
+    okt = OKT(n_at, n_it, n_exercise, n_question, d_e, d_q, d_a, d_at, d_p, d_h,
+              batch_size=batch_size, dropout=dropout)
+    filepath = tmp_path / "okt.params"
+    okt.train(data, test_data=data, epoch=2, filepath=filepath)
+    okt.load(filepath)