From fb365f46520087dbd3bd2a1d725ed69ce1e60fd4 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sat, 25 Nov 2023 12:30:30 +0530 Subject: [PATCH 01/18] Removes old commits --- .gitignore | 3 ++- requirements.txt | 8 ++++---- train.cfg | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index ccd3731..9e82841 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ +data/ experiments/ -data/rotowire/ *.txt __pycache__/ .ipynb_checkpoints/ *.pyc *.bz2 *.tar +*.DS_Store diff --git a/requirements.txt b/requirements.txt index 4810274..fede69e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ bleach==3.3.0 certifi==2019.11.28 cffi==1.13.2 chardet==3.0.4 -ConfigArgParse==0.14.0 +ConfigArgParse cycler==0.10.0 decorator==4.4.1 defusedxml==0.6.0 @@ -17,7 +17,7 @@ ipython-genutils==0.2.0 ipywidgets==7.5.1 jedi==0.15.1 Jinja2==2.11.3 -json5==0.8.5 +json5 jsonschema==3.2.0 jupyter==1.0.0 jupyter-client==5.3.4 @@ -47,7 +47,7 @@ Pygments==2.7.4 pyparsing==2.4.5 pyrsistent==0.15.6 python-dateutil==2.8.1 -PyYAML==5.4 +PyYAML pyzmq==18.1.1 qtconsole==4.6.0 requests==2.22.0 @@ -56,7 +56,7 @@ simplejson==3.17.0 six==1.13.0 terminado==0.8.3 testpath==0.4.4 -torch==1.1.0 +torch torchtext==0.4.0 tornado==6.0.3 tqdm==4.40.2 diff --git a/train.cfg b/train.cfg index 91f494f..b341fb3 100644 --- a/train.cfg +++ b/train.cfg @@ -61,7 +61,7 @@ save_checkpoint_steps: 500 # save a cp every X steps # Gpu related: -gpu_ranks: [0] # ids of gpus to use +gpu_ranks: [] # ids of gpus to use world_size: 1 # total number of distributed processes gpu_backend: nccl # type of torch distributed backend gpu_verbose_level: 0 @@ -77,7 +77,7 @@ normalization: sents accum_count: [2] # Update weights every X batches accum_steps: [0] # steps at which accum counts value changes valid_steps: 500 # run models on validation set every X steps -train_steps: 30000 +train_steps: 100 optim: adam max_grad_norm: 5 dropout: .5 From be6d39d77220dd68693ac8e3072c6635cd19c6ff Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sat, 25 Nov 2023 19:40:57 +0530 Subject: [PATCH 02/18] updates deps --- .gitignore | 1 + requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9e82841..01d2b43 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__/ *.bz2 *.tar *.DS_Store +*.idea diff --git a/requirements.txt b/requirements.txt index fede69e..4fc90a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 +30,7 @@ lab==5.1 MarkupSafe==1.1.1 matplotlib==3.1.2 mistune==0.8.4 -more-itertools==8.0.2 +more-itertools nbconvert==5.6.1 nbformat==4.4.0 notebook==6.1.5 @@ -48,7 +48,7 @@ pyparsing==2.4.5 pyrsistent==0.15.6 python-dateutil==2.8.1 PyYAML -pyzmq==18.1.1 +pyzmq qtconsole==4.6.0 requests==2.22.0 Send2Trash==1.5.0 From 63fe915cc22d71bbfc23a6ba396a714b28134e25 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sat, 25 Nov 2023 19:43:02 +0530 Subject: [PATCH 03/18] updates --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4fc90a7..713d3b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ attrs==19.3.0 backcall==0.1.0 bleach==3.3.0 certifi==2019.11.28 -cffi==1.13.2 +cffi chardet==3.0.4 ConfigArgParse cycler==0.10.0 @@ -40,7 +40,7 @@ parso==0.5.2 pexpect==4.7.0 pickleshare==0.7.5 prometheus-client==0.7.1 -prompt-toolkit==2.0.10 +prompt-toolkit ptyprocess==0.6.0 pycparser==2.19 Pygments==2.7.4 From 
d8d09fa90e6abea335735320488a85d4976d9027 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sat, 25 Nov 2023 19:44:21 +0530 Subject: [PATCH 04/18] u --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 713d3b8..0c6677e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,7 +34,7 @@ more-itertools nbconvert==5.6.1 nbformat==4.4.0 notebook==6.1.5 -numpy==1.17.4 +numpy pandocfilters==1.4.2 parso==0.5.2 pexpect==4.7.0 From 2c8a8a9f9d277cd3b316b10ce09dfd19ac71cf20 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sat, 25 Nov 2023 19:47:45 +0530 Subject: [PATCH 05/18] remove all pinned dep --- requirements.txt | 118 +++++++++++++++++++++++------------------------ 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0c6677e..1d621c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,68 +1,68 @@ -attrs==19.3.0 -backcall==0.1.0 -bleach==3.3.0 -certifi==2019.11.28 +attrs +backcall +bleach +certifi cffi -chardet==3.0.4 +chardet ConfigArgParse -cycler==0.10.0 -decorator==4.4.1 -defusedxml==0.6.0 -entrypoints==0.3 -idna==2.8 -importlib-metadata==1.3.0 -ipykernel==5.1.3 -ipython==7.10.2 -ipython-genutils==0.2.0 -ipywidgets==7.5.1 -jedi==0.15.1 -Jinja2==2.11.3 +cycler +decorator +defusedxml +entrypoints +idna +importlib-metadata +ipykernel +ipython +ipython-genutils +ipywidgets +jedi +Jinja2 json5 -jsonschema==3.2.0 -jupyter==1.0.0 -jupyter-client==5.3.4 -jupyter-console==6.0.0 -jupyter-core==4.6.1 -jupyterlab==1.2.4 -jupyterlab-server==1.0.6 -kiwisolver==1.1.0 -lab==5.1 -MarkupSafe==1.1.1 -matplotlib==3.1.2 -mistune==0.8.4 +jsonschema +jupyter +jupyter-client +jupyter-console +jupyter-core +jupyterlab +jupyterlab-server +kiwisolver +lab +MarkupSafe +matplotlib +mistune more-itertools -nbconvert==5.6.1 -nbformat==4.4.0 -notebook==6.1.5 +nbconvert +nbformat +notebook numpy -pandocfilters==1.4.2 -parso==0.5.2 -pexpect==4.7.0 -pickleshare==0.7.5 -prometheus-client==0.7.1 +pandocfilters +parso +pexpect +pickleshare +prometheus-client prompt-toolkit -ptyprocess==0.6.0 -pycparser==2.19 -Pygments==2.7.4 -pyparsing==2.4.5 -pyrsistent==0.15.6 -python-dateutil==2.8.1 +ptyprocess +pycparser +Pygments +pyparsing +pyrsistent +python-dateutil PyYAML pyzmq -qtconsole==4.6.0 -requests==2.22.0 -Send2Trash==1.5.0 -simplejson==3.17.0 -six==1.13.0 -terminado==0.8.3 -testpath==0.4.4 +qtconsole +requests +Send2Trash +simplejson +six +terminado +testpath torch -torchtext==0.4.0 -tornado==6.0.3 -tqdm==4.40.2 -traitlets==4.3.3 -urllib3==1.25.7 -wcwidth==0.1.7 -webencodings==0.5.1 -widgetsnbextension==3.5.1 -zipp==0.6.0 +torchtext +tornado +tqdm +traitlets +urllib3 +wcwidth +webencodings +widgetsnbextension +zipp From 3d0ec0d1f1281f4c55368fa6d0cbfdbf32fcb6c0 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sat, 25 Nov 2023 20:12:01 +0530 Subject: [PATCH 06/18] updates --- onmt/modules/self_attention.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/onmt/modules/self_attention.py b/onmt/modules/self_attention.py index 4c46b6d..3d2692e 100644 --- a/onmt/modules/self_attention.py +++ b/onmt/modules/self_attention.py @@ -4,7 +4,6 @@ It's actually the same module, with more or less flewibility at times, and a more flexible use of the mask (different mask per element of the batch) """ -from torch._jit_internal import weak_module, weak_script_method from torch.nn.init import constant_ from torch.nn.parameter import Parameter from torch.nn.init import xavier_uniform_ @@ -14,7 +13,6 @@ 
import torch -@weak_module class MultiHeadSelfAttention(torch.nn.Module): """ if glu_depth is not zero, we use GatedLinear layers instead of regular layers. @@ -59,7 +57,6 @@ def _reset_parameters(self): constant_(self.in_proj_bias, 0.) constant_(self.out_proj.bias, 0.) - @weak_script_method def forward(self, input, attn_mask=None): """ Inputs of forward function From e62ea67e115bd83c69b03b7a68d5c7522b2e0c08 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sat, 25 Nov 2023 20:17:42 +0530 Subject: [PATCH 07/18] Updates self attention --- onmt/modules/self_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onmt/modules/self_attention.py b/onmt/modules/self_attention.py index 3d2692e..e0cf625 100644 --- a/onmt/modules/self_attention.py +++ b/onmt/modules/self_attention.py @@ -73,7 +73,7 @@ def forward(self, input, attn_mask=None): # self-attention q, k, v = F.linear(input, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1) - q *= self.scaling + q = q * self.scaling # Cut q, k, v in num_heads part q = q.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1) From 09d93189a3a7ea1ec2cc7687880e0c7455621703 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sat, 25 Nov 2023 21:34:16 +0530 Subject: [PATCH 08/18] log loss more often --- train.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.cfg b/train.cfg index b341fb3..b516fed 100644 --- a/train.cfg +++ b/train.cfg @@ -56,7 +56,7 @@ data: experiments/exp-1/data/data # path to datafile from preprocess.py save_model: experiments/exp-1/models/model # path to store checkpoints log_file: experiments/exp-1/train-log.txt -report_every: 50 # log current loss every X steps +report_every: 10 # log current loss every X steps save_checkpoint_steps: 500 # save a cp every X steps From 86ffe215b206200bdd0880c15b8b32afd6dd79d4 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:14:53 -0500 Subject: [PATCH 09/18] print statements --- onmt/translate/beam_search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index d0ea981..f62c99c 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,6 +187,7 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. + print(type(self.topk_ids), type(self._batch_index)) torch.div(self.topk_ids, vocab_size, out=self._batch_index) self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) From f3e4cf494ecdc4fb8e64e15427245ba195e29525 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:16:35 -0500 Subject: [PATCH 10/18] fix --- onmt/translate/beam_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index f62c99c..57952a2 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,7 +187,7 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. 
- print(type(self.topk_ids), type(self._batch_index)) + print(self.topk_ids.type(), self._batch_index.type()) torch.div(self.topk_ids, vocab_size, out=self._batch_index) self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) From 67b18b901c172ac52a166942735cf3502f8ebf81 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:19:19 -0500 Subject: [PATCH 11/18] Fix --- onmt/translate/beam_search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index 57952a2..4a4d714 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,8 +187,7 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. - print(self.topk_ids.type(), self._batch_index.type()) - torch.div(self.topk_ids, vocab_size, out=self._batch_index) + self._batch_index = torch.div(self.topk_ids, vocab_size, out=self._batch_index.float()) self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) self.topk_ids.fmod_(vocab_size) # resolve true word ids From 9c63c666452ec2c4f9d869ea1da61b058e531bc3 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:20:55 -0500 Subject: [PATCH 12/18] stab --- onmt/translate/beam_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index 4a4d714..f905ff6 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,9 +187,10 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. - self._batch_index = torch.div(self.topk_ids, vocab_size, out=self._batch_index.float()) + torch.div(self.topk_ids, vocab_size, out=self._batch_index.float()) self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) + print(self.select_indices.type()) self.topk_ids.fmod_(vocab_size) # resolve true word ids # Append last prediction. From 353fb16a66e2e7f2f70b4fe63f1775db8c3a4c21 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:23:28 -0500 Subject: [PATCH 13/18] debug --- onmt/translate/beam_search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index f905ff6..5f07979 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -190,10 +190,11 @@ def advance(self, log_probs, attn, attn_key): torch.div(self.topk_ids, vocab_size, out=self._batch_index.float()) self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) - print(self.select_indices.type()) self.topk_ids.fmod_(vocab_size) # resolve true word ids # Append last prediction. 
+ print(self.alive_seq.shape) + print(self.select_indices) self.alive_seq = torch.cat( [self.alive_seq.index_select(0, self.select_indices), self.topk_ids.view(_B * self.beam_size, 1)], -1) From 630eedc5ed3f219ae9f96cdb3bc86d4f16ad9cfc Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:27:30 -0500 Subject: [PATCH 14/18] fix --- onmt/translate/beam_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index 5f07979..7eb8eb1 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,7 +187,7 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. - torch.div(self.topk_ids, vocab_size, out=self._batch_index.float()) + self._batch_index = torch.div(self.topk_ids, vocab_size) self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) self.topk_ids.fmod_(vocab_size) # resolve true word ids From adec64dc38cb6bbff638c928e86f58ed06247c1f Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:29:28 -0500 Subject: [PATCH 15/18] cast to int --- onmt/translate/beam_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index 7eb8eb1..792f530 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,7 +187,7 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. - self._batch_index = torch.div(self.topk_ids, vocab_size) + self._batch_index = torch.div(self.topk_ids, vocab_size).int() self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) self.topk_ids.fmod_(vocab_size) # resolve true word ids From 1422be6700aff7ca6825feb1fb51badd678887a8 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:31:00 -0500 Subject: [PATCH 16/18] stab --- onmt/translate/beam_search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index 792f530..dbce56b 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,14 +187,13 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. + print(self.topk_ids) self._batch_index = torch.div(self.topk_ids, vocab_size).int() self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) self.topk_ids.fmod_(vocab_size) # resolve true word ids # Append last prediction. 
- print(self.alive_seq.shape) - print(self.select_indices) self.alive_seq = torch.cat( [self.alive_seq.index_select(0, self.select_indices), self.topk_ids.view(_B * self.beam_size, 1)], -1) From b90b2d85c617dcbe19eee736f69036ecbbb0d4f7 Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Sun, 3 Dec 2023 23:31:55 -0500 Subject: [PATCH 17/18] removes print --- onmt/translate/beam_search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index dbce56b..d9471f5 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,7 +187,6 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. - print(self.topk_ids) self._batch_index = torch.div(self.topk_ids, vocab_size).int() self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) self.topk_ids.fmod_(vocab_size) # resolve true word ids # Append last prediction. From e420eba0e7a32d456a54a40de2c3cd3e91f312ff Mon Sep 17 00:00:00 2001 From: Kedar Thakkar Date: Mon, 4 Dec 2023 09:27:01 -0500 Subject: [PATCH 18/18] Adds HF dataset --- finetuning/make_hugging_face_dataset.py | 6 ++++++ requirements.txt | 1 + 2 files changed, 7 insertions(+) create mode 100644 finetuning/make_hugging_face_dataset.py diff --git a/finetuning/make_hugging_face_dataset.py b/finetuning/make_hugging_face_dataset.py new file mode 100644 index 0000000..ace1d22 --- /dev/null +++ b/finetuning/make_hugging_face_dataset.py @@ -0,0 +1,6 @@ +from datasets import load_dataset + +def parse_hugging_face_dataset(): + hf_data = load_dataset("GEM/sportsett_basketball") + with open('../data/hugging_face/train_input.txt', 'w') as f: + f.write(str(hf_data)) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1d621c0..1bb6045 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ cffi chardet ConfigArgParse cycler +datasets decorator defusedxml entrypoints
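
Note on PATCH 18: load_dataset returns a DatasetDict keyed by split rather than plain text, so writing it straight to train_input.txt only captures its repr. Below is a minimal sketch of a fuller export; it keeps the GEM/sportsett_basketball identifier and the ../data/hugging_face path from the patch, but the function name, per-split file layout, and JSON-lines serialisation are illustrative assumptions, not part of the patch series.

# Sketch: dump each split of the Hugging Face dataset to a JSON-lines file.
# Only the dataset identifier and base output path come from PATCH 18;
# everything else is an assumption for illustration.
import json
from pathlib import Path

from datasets import load_dataset


def export_hugging_face_dataset(out_dir: str = "../data/hugging_face") -> None:
    # Script-based GEM datasets may also need trust_remote_code=True on newer
    # versions of the datasets library.
    dataset = load_dataset("GEM/sportsett_basketball")
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    for split_name, split in dataset.items():
        # One JSON record per line; default=str avoids failures on non-JSON types.
        with open(out_path / f"{split_name}_input.txt", "w") as f:
            for example in split:
                f.write(json.dumps(example, default=str) + "\n")


if __name__ == "__main__":
    export_hugging_face_dataset()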