Merge pull request #40 from r9y9/fix-non-deterministic

r9y9 · web-flow · commit 1be4f8eabb77 · 2018-02-07T00:44:09.000+09:00
Fix non-deterministic incremental inference
diff --git a/deepvoice3_pytorch/deepvoice3.py b/deepvoice3_pytorch/deepvoice3.py
@@ -487,8 +487,17 @@ def incremental_forward(self, encoder_out, text_positions, speaker_embed=None,
         return outputs, alignments, dones, decoder_states
 
     def start_fresh_sequence(self):
-        for conv in self.convolutions:
-            conv.clear_buffer()
+        _clear_modules(self.preattention)
+        _clear_modules(self.convolutions)
+        self.last_conv.clear_buffer()
+
+
+def _clear_modules(modules):
+    for m in modules:
+        try:
+            m.clear_buffer()
+        except AttributeError as e:
+            pass
 
 
 class Converter(nn.Module):
diff --git a/deepvoice3_pytorch/nyanko.py b/deepvoice3_pytorch/nyanko.py
@@ -342,6 +342,7 @@ def incremental_forward(self, encoder_out, text_positions,
     def start_fresh_sequence(self):
         _clear_modules(self.audio_encoder_modules)
         _clear_modules(self.audio_decoder_modules)
+        self.last_conv.clear_buffer()
 
 
 def _clear_modules(modules):
diff --git a/tests/data/ljspeech-mel-00001.npy b/tests/data/ljspeech-mel-00001.npy
diff --git a/tests/test_deepvoice3.py b/tests/test_deepvoice3.py
@@ -18,6 +18,7 @@
 
 
 use_cuda = torch.cuda.is_available() and False
+torch.backends.cudnn.deterministic = True
 num_mels = 80
 num_freq = 513
 outputs_per_step = 4
@@ -145,13 +146,45 @@ def test_multi_speaker_deepvoice3():
     print("Done:", done.size())
 
 
-@attr("local_only")
+@attr("issue38")
+def test_incremental_path_multiple_times():
+    texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
+    seqs = np.array([text_to_sequence(t) for t in texts])
+    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))
+
+    r = 4
+    mel_dim = 80
+    sequence = Variable(torch.LongTensor(seqs))
+    text_positions = Variable(torch.LongTensor(text_positions))
+
+    for model, speaker_ids in [
+            (_get_model(force_monotonic_attention=False), None),
+            (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), Variable(torch.LongTensor([1])))]:
+        model.eval()
+
+        # first call
+        mel_outputs, linear_outputs, alignments, done = model(
+            sequence, text_positions=text_positions, speaker_ids=speaker_ids)
+
+        # second call
+        mel_outputs2, linear_outputs2, alignments2, done2 = model(
+            sequence, text_positions=text_positions, speaker_ids=speaker_ids)
+
+        # Should get same result
+        c = (mel_outputs - mel_outputs2).abs()
+        print(c.mean(), c.max())
+
+        assert np.allclose(mel_outputs.cpu().data.numpy(),
+                           mel_outputs2.cpu().data.numpy(), atol=1e-5)
+
+
 def test_incremental_correctness():
     texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
     seqs = np.array([text_to_sequence(t) for t in texts])
     text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))
 
-    mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
+    mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy")
+    mel = np.load(mel_path)
     max_target_len = mel.shape[0]
     r = 4
     mel_dim = 80
diff --git a/tests/test_nyanko.py b/tests/test_nyanko.py
@@ -16,7 +16,7 @@
 from deepvoice3_pytorch.builder import nyanko
 from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
 
-use_cuda = torch.cuda.is_available()
+use_cuda = torch.cuda.is_available() and False
 num_mels = 80
 num_freq = 513
 outputs_per_step = 4
@@ -57,13 +57,45 @@ def test_nyanko_basics():
         mel_outputs, linear_outputs, alignments, done = model(x, y)
 
 
-@attr("local_only")
+@attr("issue38")
+def test_incremental_path_multiple_times():
+    texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
+    seqs = np.array([text_to_sequence(t) for t in texts])
+    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))
+
+    r = 1
+    mel_dim = 80
+
+    sequence = Variable(torch.LongTensor(seqs))
+    text_positions = Variable(torch.LongTensor(text_positions))
+
+    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
+                   r=r, force_monotonic_attention=False)
+    model.eval()
+
+    # first call
+    mel_outputs, linear_outputs, alignments, done = model(
+        sequence, text_positions=text_positions, speaker_ids=None)
+
+    # second call
+    mel_outputs2, linear_outputs2, alignments2, done2 = model(
+        sequence, text_positions=text_positions, speaker_ids=None)
+
+    # Should get same result
+    c = (mel_outputs - mel_outputs2).abs()
+    print(c.mean(), c.max())
+
+    assert np.allclose(mel_outputs.cpu().data.numpy(),
+                       mel_outputs2.cpu().data.numpy(), atol=1e-5)
+
+
 def test_incremental_correctness():
     texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
     seqs = np.array([text_to_sequence(t) for t in texts])
     text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))
 
-    mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
+    mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy")
+    mel = np.load(mel_path)[::4]
     max_target_len = mel.shape[0]
     r = 1
     mel_dim = 80