diff --git a/.gitignore b/.gitignore
index ccd3731..01d2b43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,10 @@
+data/
 experiments/
-data/rotowire/
 *.txt
 __pycache__/
 .ipynb_checkpoints/
 *.pyc
 *.bz2
 *.tar
+*.DS_Store
+*.idea
diff --git a/finetuning/make_hugging_face_dataset.py b/finetuning/make_hugging_face_dataset.py
new file mode 100644
index 0000000..ace1d22
--- /dev/null
+++ b/finetuning/make_hugging_face_dataset.py
@@ -0,0 +1,39 @@
+import json
+import os
+
+from datasets import load_dataset
+
+
+def parse_hugging_face_dataset(
+        dataset_name='GEM/sportsett_basketball',
+        split='train',
+        output_path='../data/hugging_face/train_input.txt'):
+    """Dump one split of a Hugging Face dataset to a JSON-lines text file.
+
+    With no arguments this loads ``GEM/sportsett_basketball`` and writes its
+    ``train`` split to ``../data/hugging_face/train_input.txt`` (resolved
+    against the current working directory).
+
+    Args:
+        dataset_name: Hub identifier passed to ``datasets.load_dataset``.
+        split: Key of the split to export from the loaded dataset.
+        output_path: Destination file; it is overwritten if it exists and
+            missing parent directories are created.
+
+    Raises:
+        KeyError: If ``split`` is not present in the loaded dataset.
+    """
+    hf_data = load_dataset(dataset_name)
+    examples = hf_data[split]
+
+    # open() does not create intermediate directories, so make them first.
+    out_dir = os.path.dirname(output_path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+
+    # file.write() only accepts str, so each example is serialized as one
+    # JSON object per line rather than passing the dataset object directly.
+    # NOTE(review): confirm JSON lines is the format downstream code expects.
+    with open(output_path, 'w', encoding='utf-8') as f:
+        for example in examples:
+            f.write(json.dumps(example, ensure_ascii=False) + '\n')
diff --git a/onmt/modules/self_attention.py b/onmt/modules/self_attention.py
index 4c46b6d..e0cf625 100644
--- a/onmt/modules/self_attention.py
+++ b/onmt/modules/self_attention.py
@@ -4,7 +4,6 @@
 It's actually the same module, with more or less flewibility at times,
 and a more flexible use of the mask (different mask per element of the batch)
 """
-from torch._jit_internal import weak_module, weak_script_method
 from torch.nn.init import constant_
 from torch.nn.parameter import Parameter
 from torch.nn.init import xavier_uniform_
@@ -14,7 +13,6 @@
 import torch
 
 
-@weak_module
 class MultiHeadSelfAttention(torch.nn.Module):
     """
     if glu_depth is not zero, we use GatedLinear layers instead of regular layers.
@@ -59,7 +57,6 @@ def _reset_parameters(self):
             constant_(self.in_proj_bias, 0.)
             constant_(self.out_proj.bias, 0.)
 
-    @weak_script_method
     def forward(self, input, attn_mask=None):
         """
         Inputs of forward function
@@ -76,7 +73,7 @@ def forward(self, input, attn_mask=None):
 
         # self-attention
         q, k, v = F.linear(input, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1)
-        q *= self.scaling
+        q = q * self.scaling
         
         # Cut q, k, v in num_heads part
         q = q.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py
index d0ea981..d9471f5 100644
--- a/onmt/translate/beam_search.py
+++ b/onmt/translate/beam_search.py
@@ -187,7 +187,7 @@ def advance(self, log_probs, attn, attn_key):
         torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
 
         # Resolve beam origin and map to batch index flat representation.
-        torch.div(self.topk_ids, vocab_size, out=self._batch_index)
+        self._batch_index = torch.div(self.topk_ids, vocab_size).int()
         self._batch_index += self._beam_offset[:_B].unsqueeze(1)
         self.select_indices = self._batch_index.view(_B * self.beam_size)
         self.topk_ids.fmod_(vocab_size)  # resolve true word ids
diff --git a/requirements.txt b/requirements.txt
index 4810274..1bb6045 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,68 +1,69 @@
-attrs==19.3.0
-backcall==0.1.0
-bleach==3.3.0
-certifi==2019.11.28
-cffi==1.13.2
-chardet==3.0.4
-ConfigArgParse==0.14.0
-cycler==0.10.0
-decorator==4.4.1
-defusedxml==0.6.0
-entrypoints==0.3
-idna==2.8
-importlib-metadata==1.3.0
-ipykernel==5.1.3
-ipython==7.10.2
-ipython-genutils==0.2.0
-ipywidgets==7.5.1
-jedi==0.15.1
-Jinja2==2.11.3
-json5==0.8.5
-jsonschema==3.2.0
-jupyter==1.0.0
-jupyter-client==5.3.4
-jupyter-console==6.0.0
-jupyter-core==4.6.1
-jupyterlab==1.2.4
-jupyterlab-server==1.0.6
-kiwisolver==1.1.0
-lab==5.1
-MarkupSafe==1.1.1
-matplotlib==3.1.2
-mistune==0.8.4
-more-itertools==8.0.2
-nbconvert==5.6.1
-nbformat==4.4.0
-notebook==6.1.5
-numpy==1.17.4
-pandocfilters==1.4.2
-parso==0.5.2
-pexpect==4.7.0
-pickleshare==0.7.5
-prometheus-client==0.7.1
-prompt-toolkit==2.0.10
-ptyprocess==0.6.0
-pycparser==2.19
-Pygments==2.7.4
-pyparsing==2.4.5
-pyrsistent==0.15.6
-python-dateutil==2.8.1
-PyYAML==5.4
-pyzmq==18.1.1
-qtconsole==4.6.0
-requests==2.22.0
-Send2Trash==1.5.0
-simplejson==3.17.0
-six==1.13.0
-terminado==0.8.3
-testpath==0.4.4
-torch==1.1.0
-torchtext==0.4.0
-tornado==6.0.3
-tqdm==4.40.2
-traitlets==4.3.3
-urllib3==1.25.7
-wcwidth==0.1.7
-webencodings==0.5.1
-widgetsnbextension==3.5.1
-zipp==0.6.0
+attrs
+backcall
+bleach
+certifi
+cffi
+chardet
+ConfigArgParse
+cycler
+datasets
+decorator
+defusedxml
+entrypoints
+idna
+importlib-metadata
+ipykernel
+ipython
+ipython-genutils
+ipywidgets
+jedi
+Jinja2
+json5
+jsonschema
+jupyter
+jupyter-client
+jupyter-console
+jupyter-core
+jupyterlab
+jupyterlab-server
+kiwisolver
+lab
+MarkupSafe
+matplotlib
+mistune
+more-itertools
+nbconvert
+nbformat
+notebook
+numpy
+pandocfilters
+parso
+pexpect
+pickleshare
+prometheus-client
+prompt-toolkit
+ptyprocess
+pycparser
+Pygments
+pyparsing
+pyrsistent
+python-dateutil
+PyYAML
+pyzmq
+qtconsole
+requests
+Send2Trash
+simplejson
+six
+terminado
+testpath
+torch
+torchtext
+tornado
+tqdm
+traitlets
+urllib3
+wcwidth
+webencodings
+widgetsnbextension
+zipp
diff --git a/train.cfg b/train.cfg
index 91f494f..b516fed 100644
--- a/train.cfg
+++ b/train.cfg
@@ -56,12 +56,12 @@ data: experiments/exp-1/data/data  # path to datafile from preprocess.py
 save_model: experiments/exp-1/models/model  # path to store checkpoints
 log_file: experiments/exp-1/train-log.txt
 
-report_every: 50  # log current loss every X steps
+report_every: 10  # log current loss every X steps
 save_checkpoint_steps: 500  # save a cp every X steps
 
 
 # Gpu related:
-gpu_ranks: [0]  # ids of gpus to use
+gpu_ranks: []  # ids of gpus to use
 world_size: 1  # total number of distributed processes
 gpu_backend: nccl  # type of torch distributed backend
 gpu_verbose_level: 0
@@ -77,7 +77,7 @@ normalization: sents
 accum_count: [2]  # Update weights every X batches
 accum_steps: [0]  # steps at which accum counts value changes
 valid_steps: 500 # run models on validation set every X steps
-train_steps: 30000
+train_steps: 100
 optim: adam
 max_grad_norm: 5
 dropout: .5