diff --git a/.gitignore b/.gitignore index ccd3731..01d2b43 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ +data/ experiments/ -data/rotowire/ *.txt __pycache__/ .ipynb_checkpoints/ *.pyc *.bz2 *.tar +*.DS_Store +*.idea diff --git a/finetuning/make_hugging_face_dataset.py b/finetuning/make_hugging_face_dataset.py new file mode 100644 index 0000000..ace1d22 --- /dev/null +++ b/finetuning/make_hugging_face_dataset.py @@ -0,0 +1,13 @@ +import json +import os + +from datasets import load_dataset + +def parse_hugging_face_dataset(): + """Write the GEM/sportsett_basketball train split to ../data/hugging_face/train_input.txt, one JSON object per line.""" + out_path = '../data/hugging_face/train_input.txt' + hf_data = load_dataset("GEM/sportsett_basketball", split="train") + os.makedirs(os.path.dirname(out_path), exist_ok=True) + with open(out_path, 'w', encoding='utf-8') as f: + for example in hf_data: + f.write(json.dumps(example) + '\n') \ No newline at end of file diff --git a/onmt/modules/self_attention.py b/onmt/modules/self_attention.py index 4c46b6d..e0cf625 100644 --- a/onmt/modules/self_attention.py +++ b/onmt/modules/self_attention.py @@ -4,7 +4,6 @@ It's actually the same module, with more or less flewibility at times, and a more flexible use of the mask (different mask per element of the batch) """ -from torch._jit_internal import weak_module, weak_script_method from torch.nn.init import constant_ from torch.nn.parameter import Parameter from torch.nn.init import xavier_uniform_ @@ -14,7 +13,6 @@ import torch -@weak_module class MultiHeadSelfAttention(torch.nn.Module): """ if glu_depth is not zero, we use GatedLinear layers instead of regular layers. @@ -59,7 +57,6 @@ def _reset_parameters(self): constant_(self.in_proj_bias, 0.) constant_(self.out_proj.bias, 0.) 
- @weak_script_method def forward(self, input, attn_mask=None): """ Inputs of forward function @@ -76,7 +73,7 @@ def forward(self, input, attn_mask=None): # self-attention q, k, v = F.linear(input, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1) - q *= self.scaling + q = q * self.scaling # Cut q, k, v in num_heads part q = q.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1) diff --git a/onmt/translate/beam_search.py b/onmt/translate/beam_search.py index d0ea981..d9471f5 100644 --- a/onmt/translate/beam_search.py +++ b/onmt/translate/beam_search.py @@ -187,7 +187,7 @@ def advance(self, log_probs, attn, attn_key): torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs) # Resolve beam origin and map to batch index flat representation. - torch.div(self.topk_ids, vocab_size, out=self._batch_index) + self._batch_index = torch.div(self.topk_ids, vocab_size, rounding_mode='trunc') self._batch_index += self._beam_offset[:_B].unsqueeze(1) self.select_indices = self._batch_index.view(_B * self.beam_size) self.topk_ids.fmod_(vocab_size) # resolve true word ids diff --git a/requirements.txt b/requirements.txt index 4810274..1bb6045 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,68 +1,69 @@ -attrs==19.3.0 -backcall==0.1.0 -bleach==3.3.0 -certifi==2019.11.28 -cffi==1.13.2 -chardet==3.0.4 -ConfigArgParse==0.14.0 -cycler==0.10.0 -decorator==4.4.1 -defusedxml==0.6.0 -entrypoints==0.3 -idna==2.8 -importlib-metadata==1.3.0 -ipykernel==5.1.3 -ipython==7.10.2 -ipython-genutils==0.2.0 -ipywidgets==7.5.1 -jedi==0.15.1 -Jinja2==2.11.3 -json5==0.8.5 -jsonschema==3.2.0 -jupyter==1.0.0 -jupyter-client==5.3.4 -jupyter-console==6.0.0 -jupyter-core==4.6.1 -jupyterlab==1.2.4 -jupyterlab-server==1.0.6 -kiwisolver==1.1.0 -lab==5.1 -MarkupSafe==1.1.1 -matplotlib==3.1.2 -mistune==0.8.4 -more-itertools==8.0.2 -nbconvert==5.6.1 -nbformat==4.4.0 -notebook==6.1.5 -numpy==1.17.4 -pandocfilters==1.4.2 -parso==0.5.2 -pexpect==4.7.0 -pickleshare==0.7.5 
-prometheus-client==0.7.1 -prompt-toolkit==2.0.10 -ptyprocess==0.6.0 -pycparser==2.19 -Pygments==2.7.4 -pyparsing==2.4.5 -pyrsistent==0.15.6 -python-dateutil==2.8.1 -PyYAML==5.4 -pyzmq==18.1.1 -qtconsole==4.6.0 -requests==2.22.0 -Send2Trash==1.5.0 -simplejson==3.17.0 -six==1.13.0 -terminado==0.8.3 -testpath==0.4.4 -torch==1.1.0 -torchtext==0.4.0 -tornado==6.0.3 -tqdm==4.40.2 -traitlets==4.3.3 -urllib3==1.25.7 -wcwidth==0.1.7 -webencodings==0.5.1 -widgetsnbextension==3.5.1 -zipp==0.6.0 +attrs +backcall +bleach +certifi +cffi +chardet +ConfigArgParse +cycler +datasets +decorator +defusedxml +entrypoints +idna +importlib-metadata +ipykernel +ipython +ipython-genutils +ipywidgets +jedi +Jinja2 +json5 +jsonschema +jupyter +jupyter-client +jupyter-console +jupyter-core +jupyterlab +jupyterlab-server +kiwisolver +lab +MarkupSafe +matplotlib +mistune +more-itertools +nbconvert +nbformat +notebook +numpy +pandocfilters +parso +pexpect +pickleshare +prometheus-client +prompt-toolkit +ptyprocess +pycparser +Pygments +pyparsing +pyrsistent +python-dateutil +PyYAML +pyzmq +qtconsole +requests +Send2Trash +simplejson +six +terminado +testpath +torch +torchtext +tornado +tqdm +traitlets +urllib3 +wcwidth +webencodings +widgetsnbextension +zipp diff --git a/train.cfg b/train.cfg index 91f494f..b516fed 100644 --- a/train.cfg +++ b/train.cfg @@ -56,12 +56,12 @@ data: experiments/exp-1/data/data # path to datafile from preprocess.py save_model: experiments/exp-1/models/model # path to store checkpoints log_file: experiments/exp-1/train-log.txt -report_every: 50 # log current loss every X steps +report_every: 10 # log current loss every X steps save_checkpoint_steps: 500 # save a cp every X steps # Gpu related: -gpu_ranks: [0] # ids of gpus to use +gpu_ranks: [] # ids of gpus to use world_size: 1 # total number of distributed processes gpu_backend: nccl # type of torch distributed backend gpu_verbose_level: 0 @@ -77,7 +77,7 @@ normalization: sents accum_count: [2] # Update weights 
every X batches accum_steps: [0] # steps at which accum counts value changes valid_steps: 500 # run models on validation set every X steps -train_steps: 30000 +train_steps: 100 optim: adam max_grad_norm: 5 dropout: .5