
Commit 93b325f

Author: Ryan Sepassi

Baseline model for GeneExpression problem

PiperOrigin-RevId: 163286026

1 parent: 5242ac6

10 files changed: +279 additions, -42 deletions

tensor2tensor/data_generators/all_problems.py

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@
 # pylint: disable=g-import-not-at-top
 try:
   # Requires h5py
-  from tensor2tensor.data_generators import genetics
+  from tensor2tensor.data_generators import gene_expression
 except ImportError:
   pass
 # pylint: enable=g-import-not-at-top
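
The module is imported inside try/except because the gene expression data generators require h5py, an optional dependency; when h5py is absent, the rest of the problem registry still loads. A minimal sketch of the same guard pattern (the HAVE_H5PY flag is hypothetical, not part of the commit):

try:
  import h5py  # optional dependency for the gene expression problems
  HAVE_H5PY = True
except ImportError:
  HAVE_H5PY = False

if not HAVE_H5PY:
  print("h5py not installed; gene expression problems unavailable")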

tensor2tensor/data_generators/genetics.py renamed to tensor2tensor/data_generators/gene_expression.py

Lines changed: 23 additions & 16 deletions

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""Genetics problems.
+"""Gene expression problems.

 Inputs are bases ACTG (with indices assigned in that order).

@@ -82,7 +82,7 @@ def chunk_size(self):
   def feature_encoders(self, data_dir):
     del data_dir
     return {
-        "inputs": GeneticBaseEncoder(chunk_size=self.chunk_size),
+        "inputs": DNAEncoder(chunk_size=self.chunk_size),
         # TODO(rsepassi): RealEncoder?
         "targets": text_encoder.TextEncoder()
     }
@@ -166,17 +166,24 @@ def example_reading_spec(self):
   def preprocess_examples(self, examples, mode):
     del mode

+    # Reshape targets
     examples["targets"] = tf.reshape(examples["targets"],
                                      [-1, 1, self.num_output_predictions])
+    examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1, 1])
+
+    # Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them.
+    # Add epsilon because some unmasked labels are actually 0.
+    examples["targets"] += 1e-6
+    examples["targets"] *= examples["targets_mask"]

     return examples

   def eval_metrics(self):
     return [metrics.Metrics.RMSE]


-@registry.register_problem("genetics_cage10")
-class GeneticsCAGE10(GeneExpressionProblem):
+@registry.register_problem("gene_expression_cage10")
+class GeneExpressionCAGE10(GeneExpressionProblem):

   @property
   def download_url(self):
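
The new masking logic in preprocess_examples works because id 0 is the pad value that padded losses and metrics skip: multiplying by the mask zeroes out masked positions, while the 1e-6 epsilon keeps genuinely-zero labels from being mistaken for padding. A small NumPy sketch of the same arithmetic:

import numpy as np

targets = np.array([[0.0], [2.5], [1.3]])  # first label is genuinely 0
mask = np.array([[1.0], [1.0], [0.0]])     # last position is masked

targets += 1e-6  # lift real zeros just above the pad value
targets *= mask  # masked entries become exactly 0 (i.e. pad)

print(targets.ravel())  # [1.0e-06 2.500001e+00 0.0e+00]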
@@ -187,8 +194,8 @@ def h5_file(self):
     return "cage10.h5"


-@registry.register_problem("genetics_gm12878")
-class GeneticsGM12878(GeneExpressionProblem):
+@registry.register_problem("gene_expression_gm12878")
+class GeneExpressionGM12878(GeneExpressionProblem):

   @property
   def download_url(self):
@@ -199,8 +206,8 @@ def h5_file(self):
     return "gm12878.h5"


-@registry.register_problem("genetics_l262k")
-class GeneticsL262k(GeneExpressionProblem):
+@registry.register_problem("gene_expression_l262k")
+class GeneExpressionL262k(GeneExpressionProblem):

   @property
   def h5_file(self):
@@ -236,7 +243,7 @@ def dataset_generator(filepath,
                       chunk_size=1,
                       start_idx=None,
                       end_idx=None):
-  encoder = GeneticBaseEncoder(chunk_size=chunk_size)
+  encoder = DNAEncoder(chunk_size=chunk_size)
   with h5py.File(filepath, "r") as h5_file:
     # Get input keys from h5_file
     src_keys = [s % dataset for s in ["%s_in", "%s_na", "%s_out"]]
@@ -291,7 +298,7 @@ def to_example_dict(encoder, inputs, mask, outputs):
   return ex_dict


-class GeneticBaseEncoder(text_encoder.TextEncoder):
+class DNAEncoder(text_encoder.TextEncoder):
   """ACTG strings to ints and back. Optionally chunks bases into single ids.

   Uses 'X' as an unknown base.
@@ -302,14 +309,14 @@ class GeneticBaseEncoder(text_encoder.TextEncoder):
   def __init__(self,
                chunk_size=1,
                num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS):
-    super(GeneticBaseEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
+    super(DNAEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
     # Build a vocabulary of chunks of size chunk_size
     self._chunk_size = chunk_size
     chunks = []
     for size in range(1, chunk_size + 1):
-      c = itertools.product(_bases + [GeneticBaseEncoder.UNK], repeat=size)
+      c = itertools.product(_bases + [DNAEncoder.UNK], repeat=size)
       num_pad = chunk_size - size
-      padding = (GeneticBaseEncoder.PAD,) * num_pad
+      padding = (DNAEncoder.PAD,) * num_pad
       c = [el + padding for el in c]
       chunks.extend(c)
     chunks.sort()
@@ -323,7 +330,7 @@ def vocab_size(self):

   def encode(self, s):
     bases = list(s)
-    pad = [GeneticBaseEncoder.PAD] * (len(bases) % self._chunk_size)
+    pad = [DNAEncoder.PAD] * (len(bases) % self._chunk_size)
     bases.extend(pad)
     assert (len(bases) % self._chunk_size) == 0
     num_chunks = len(bases) // self._chunk_size
@@ -342,8 +349,8 @@ def decode(self, ids):
     for idx in ids:
       if idx >= self._num_reserved_ids:
         chunk = self._ids_to_chunk[idx]
-        if GeneticBaseEncoder.PAD in chunk:
-          chunk = chunk[:chunk.index(GeneticBaseEncoder.PAD)]
+        if DNAEncoder.PAD in chunk:
+          chunk = chunk[:chunk.index(DNAEncoder.PAD)]
       else:
         chunk = [text_encoder.RESERVED_TOKENS[idx]]
       bases.extend(chunk)
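
For context on the renamed encoder, a hedged usage sketch (assuming the module path above; the exact ids produced depend on the reserved tokens and the sorted chunk vocabulary):

from tensor2tensor.data_generators import gene_expression

encoder = gene_expression.DNAEncoder(chunk_size=2)

# With chunk_size=2, the vocabulary holds every 1- and 2-base combination of
# A, C, G, T, X (shorter chunks right-padded), plus the reserved ids.
ids = encoder.encode("ACTGC")  # odd length: the last chunk gets one pad base
print(ids)                     # 3 ids, one per 2-base chunk
print(encoder.decode(ids))     # padding is stripped, bases are recovered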

tensor2tensor/data_generators/genetics_test.py renamed to tensor2tensor/data_generators/gene_expression_test.py

Lines changed: 4 additions & 4 deletions

@@ -22,7 +22,7 @@

 import numpy as np

-from tensor2tensor.data_generators import genetics
+from tensor2tensor.data_generators import gene_expression

 import tensorflow as tf

@@ -40,15 +40,15 @@ def _oneHotBases(self, bases):
     return np.array(one_hots)

   def testRecordToExample(self):
-    encoder = genetics.GeneticBaseEncoder(chunk_size=2)
+    encoder = gene_expression.DNAEncoder(chunk_size=2)
     raw_inputs = ["A", "C", "G", "X", "C", "T"]

     # Put in numpy arrays in the same format as in the h5 file
     inputs = self._oneHotBases(raw_inputs)
     mask = np.array([True, False, True])
     outputs = np.array([[1.0, 2.0, 3.0], [5.0, 1.0, 0.2], [5.1, 2.3, 2.3]])
     # Convert to example dict
-    ex_dict = genetics.to_example_dict(encoder, inputs, mask, outputs)
+    ex_dict = gene_expression.to_example_dict(encoder, inputs, mask, outputs)

     self.assertEqual(len(raw_inputs) // 2 + 1, len(ex_dict["inputs"]))
     self.assertAllEqual(encoder.encode(raw_inputs) + [1], ex_dict["inputs"])
@@ -61,7 +61,7 @@ def testGenerateShardArgs(self):
     num_examples = 37
     num_shards = 4
     outfiles = [str(i) for i in range(num_shards)]
-    shard_args = genetics.generate_shard_args(outfiles, num_examples)
+    shard_args = gene_expression.generate_shard_args(outfiles, num_examples)

     starts, ends, fnames = zip(*shard_args)
     self.assertAllEqual([0, 9, 18, 27], starts)
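
The expected shard starts [0, 9, 18, 27] follow from an even integer split of 37 examples across 4 shards, with the remainder landing in the last shard. A hedged re-derivation of the test's expectations (shard_bounds is a stand-in, not the real generate_shard_args):

def shard_bounds(num_examples, num_shards):
  """Stand-in for generate_shard_args' index math (assumption)."""
  per_shard = num_examples // num_shards  # 37 // 4 == 9
  starts = [i * per_shard for i in range(num_shards)]
  ends = starts[1:] + [num_examples]      # last shard absorbs the remainder
  return starts, ends

print(shard_bounds(37, 4))  # ([0, 9, 18, 27], [9, 18, 27, 37])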

tensor2tensor/models/common_layers.py

Lines changed: 17 additions & 14 deletions

@@ -469,7 +469,10 @@ def get_norm(norm_type):
                      "'noam', 'none'.")


-def residual_fn(x, y, norm_type, residual_dropout,
+def residual_fn(x,
+                y,
+                norm_type,
+                residual_dropout,
                 filters=None,
                 epsilon=1e-16,
                 name="residual"):
@@ -559,11 +562,17 @@ def conv_block_internal(conv_fn,


 def conv_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs):
-  """A block of standard convolutions."""
+  """A block of standard 2d convolutions."""
   return conv_block_internal(conv, inputs, filters,
                              dilation_rates_and_kernel_sizes, **kwargs)


+def conv1d_block(inputs, filters, dilation_rates_and_kernel_sizes, **kwargs):
+  """A block of standard 1d convolutions."""
+  return conv_block_internal(conv1d, inputs, filters,
+                             dilation_rates_and_kernel_sizes, **kwargs)
+
+
 def separable_conv_block(inputs, filters, dilation_rates_and_kernel_sizes,
                          **kwargs):
   """A block of separable convolutions."""
@@ -858,10 +867,7 @@ def multiscale_conv_sum(inputs, output_size, dilation_rates_and_kernel_sizes,
   return tf.add_n(results) * (len(results)**-0.5)


-def multiscale_conv_and_attention(x,
-                                  padding,
-                                  hparams,
-                                  source=None):
+def multiscale_conv_and_attention(x, padding, hparams, source=None):
   """A common part of t2t layers.

   First, do a linear multiscale convolution
@@ -925,10 +931,7 @@ def conv_with_pools(inputs, output_size, kernel_size, pool_sizes, pooling_type,
   return tf.add_n(results) * (len(results)**-0.5)


-def conv_with_pools_and_attention(x,
-                                  padding,
-                                  hparams,
-                                  source=None):
+def conv_with_pools_and_attention(x, padding, hparams, source=None):
   """A common part of t2t layers.

   First, do conv_with_pools
@@ -1389,8 +1392,8 @@ def padded_cross_entropy(logits,
   vocab_size = tf.shape(logits)[-1]
   with tf.name_scope("padded_cross_entropy", [logits, labels]):
     pad_logits, pad_labels = pad_with_zeros(logits, labels)
-    xent = smoothing_cross_entropy(pad_logits, pad_labels,
-                                   vocab_size, confidence)
+    xent = smoothing_cross_entropy(pad_logits, pad_labels, vocab_size,
+                                   confidence)
     weights = weights_fn(pad_labels)
     if not reduce_sum:
       return xent * weights, weights
@@ -1493,8 +1496,8 @@ def linear_set_layer(layer_size,
     # Unfortunately tf doesn't support broadcasting via concat, but we can
     # simply add the transformed context to get the same effect.
     context = tf.expand_dims(context, axis=1)
-    cont_tfm = conv1d(context, layer_size, 1,
-                      activation=None, name="cont_conv")
+    cont_tfm = conv1d(
+        context, layer_size, 1, activation=None, name="cont_conv")
     outputs += cont_tfm

   if activation_fn is not None:
New file

Lines changed: 132 additions & 0 deletions

@@ -0,0 +1,132 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Models for gene expression from DNA."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensor2tensor.models import common_hparams
+from tensor2tensor.models import common_layers
+from tensor2tensor.utils import registry
+from tensor2tensor.utils import t2t_model
+
+import tensorflow as tf
+
+
+@registry.register_model
+class GeneExpressionConv(t2t_model.T2TModel):
+  """Gene expression conv net.
+
+  Based on "Basenji" model from
+  http://www.biorxiv.org/content/early/2017/07/10/161851
+
+  Uses layer_norm instead of batch_norm.
+  """
+
+  def model_fn_body(self, features):
+    inputs = features["inputs"]
+    inputs.get_shape().assert_has_rank(4)
+
+    hp = self._hparams
+
+    out = inputs
+    out = common_layers.flatten4d3d(out)
+
+    # Conv layers
+    for i in xrange(hp.num_conv_layers):
+      out = conv_layer(
+          out,
+          hp.hidden_size,
+          hp.kernel_width,
+          hp.stride,
+          hp.pooling_windows[i],
+          hp.dropout,
+          1,
+          name="conv_%d" % (i + 1))
+
+    # Dense dilated conv layers
+    for i in xrange(hp.num_dconv_layers):
+      dilation_rate = 2**(i + 1)
+      dconv_out = conv_layer(
+          out,
+          hp.hidden_size,
+          hp.kernel_width,
+          1,
+          0,
+          hp.dropout,
+          dilation_rate,
+          name="dconv_%d" % (i + 1))
+      out = tf.concat([out, dconv_out], axis=2)
+
+    # Fully connected layer
+    out = fc_layer(out, hp.hidden_size, hp.dropout, name="fc")
+
+    out.get_shape().assert_has_rank(3)
+    out = tf.expand_dims(out, 2)
+    return out
+
+
+def conv_layer(x,
+               hidden_size,
+               kernel_size,
+               stride,
+               pooling_window,
+               dropout_rate,
+               dilation_rate,
+               name="conv"):
+  with tf.variable_scope(name):
+    out = x
+    out = common_layers.conv1d_block(
+        out,
+        hidden_size, [(dilation_rate, kernel_size)],
+        strides=stride,
+        first_relu=False,
+        padding="same")
+    out = tf.nn.relu(out)
+    if pooling_window:
+      out = tf.layers.max_pooling1d(
+          out, pooling_window, pooling_window, padding="same")
+    out = tf.layers.dropout(out, dropout_rate)
+    return out
+
+
+def fc_layer(x, num_out, dropout_rate, name="fc"):
+  with tf.variable_scope(name):
+    out = x
+    out = tf.layers.dense(out, num_out)
+    out = tf.contrib.layers.layer_norm(out)
+    out = tf.nn.relu(out)
+    out = tf.layers.dropout(out, dropout_rate)
+    return out
+
+
+@registry.register_hparams
+def gene_expression_conv_base():
+  """Hparams for GeneExpressionConv model."""
+  hparams = common_hparams.basic_params1()
+  hparams.add_hparam("num_conv_layers", 4)
+  hparams.add_hparam("num_dconv_layers", 7)
+  hparams.add_hparam("pooling_windows", [2, 4, 4, 4])
+
+  # TODO(rsepassi): Correct the values of these hyperparameters
+  hparams.hidden_size = 128
+  hparams.kernel_width = 128
+  hparams.add_hparam("stride", 1)
+  return hparams
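
Under these defaults, the four pooled conv layers shrink the sequence by a factor of 2 * 4 * 4 * 4 = 128, the dilated layers sweep rates 2 through 128, and the dense concatenation grows the channel count to 1024 before the final fully connected layer. A small sketch of that bookkeeping, with values taken from the hparams above:

pooling_windows = [2, 4, 4, 4]
num_dconv_layers = 7
hidden_size = 128

downsample = 1
for w in pooling_windows:
  downsample *= w
print(downsample)  # 128: each output position summarizes ~128 input bases

print([2**(i + 1) for i in range(num_dconv_layers)])  # [2, 4, 8, ..., 128]

# Each dconv layer concatenates hidden_size channels onto its input.
print(hidden_size + num_dconv_layers * hidden_size)  # 1024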
