Commit 56d65f0

Author: Ryan Sepassi
Working open-source distributed training

PiperOrigin-RevId: 161731856
1 parent 29f2e2e · commit 56d65f0

3 files changed, 42 additions and 28 deletions


tensor2tensor/bin/t2t-make-tf-configs

Lines changed: 13 additions & 12 deletions
@@ -17,13 +17,13 @@

 Usage:

-`t2t-make-tf-configs --workers="server1:1234" --ps="server3:2134,server4:2334"`
+`t2t-make-tf-configs --masters="server1:1234" --ps="server3:2134,server4:2334"`

-Outputs 1 line per job to stdout, first the workers, then the parameter servers.
+Outputs 1 line per job to stdout, first the masters, then the parameter servers.
 Each line has the TF_CONFIG, then a tab, then the command line flags for that
 job.

-If there is a single worker, workers will have the `--sync` flag.
+If there is a single master, it will have the `--sync` flag.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -38,31 +38,32 @@ import tensorflow as tf
 flags = tf.flags
 FLAGS = flags.FLAGS

-flags.DEFINE_string("workers", "", "Comma-separated list of worker addresses")
+flags.DEFINE_string("masters", "", "Comma-separated list of master addresses")
 flags.DEFINE_string("ps", "", "Comma-separated list of ps addresses")


 def main(_):
-  if not (FLAGS.workers and FLAGS.ps):
-    raise ValueError("Must provide --workers and --ps")
+  if not (FLAGS.masters and FLAGS.ps):
+    raise ValueError("Must provide --masters and --ps")

-  workers = FLAGS.workers.split(",")
+  masters = FLAGS.masters.split(",")
   ps = FLAGS.ps.split(",")

-  cluster = {"ps": ps, "worker": workers}
+  cluster = {"ps": ps, "master": masters}

-  for task_type, jobs in (("worker", workers), ("ps", ps)):
+  for task_type, jobs in (("master", masters), ("ps", ps)):
     for idx, job in enumerate(jobs):
-      if task_type == "worker":
+      if task_type == "master":
         cmd_line_flags = " ".join([
             "--master=grpc://%s" % job,
             "--ps_replicas=%d" % len(ps),
-            "--worker_replicas=%d" % len(workers),
+            "--worker_replicas=%d" % len(masters),
             "--worker_gpu=1",
             "--worker_id=%d" % idx,
+            "--worker_job='/job:master'",
             "--ps_gpu=1",
             "--schedule=train",
-            "--sync" if len(workers) == 1 else "",
+            "--sync" if len(masters) == 1 else "",
         ])
       else:
         cmd_line_flags = " ".join([
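
For reference, below is a rough standalone sketch of the single output line this script would emit for a master job, assembled from the flag logic in this diff and the `TF_CONFIG` layout documented in `distributed_training.md`. The addresses come from the usage string above; the `'environment': 'cloud'` key mirrors the docs example and is an assumption about the script's exact output.

```
# Hypothetical sketch (not part of this commit): approximates what
# `t2t-make-tf-configs --masters="server1:1234" --ps="server3:2134,server4:2334"`
# would print for its single master job.
import json

masters = ["server1:1234"]
ps = ["server3:2134", "server4:2334"]
cluster = {"ps": ps, "master": masters}

idx, job = 0, masters[0]
tf_config = json.dumps({
    "cluster": cluster,
    "task": {"type": "master", "index": idx},
    "environment": "cloud",  # assumed; mirrors the docs example
})
cmd_line_flags = " ".join([
    "--master=grpc://%s" % job,
    "--ps_replicas=%d" % len(ps),
    "--worker_replicas=%d" % len(masters),
    "--worker_gpu=1",
    "--worker_id=%d" % idx,
    "--worker_job='/job:master'",
    "--ps_gpu=1",
    "--schedule=train",
    "--sync" if len(masters) == 1 else "",
])
# One line per job: TF_CONFIG, then a tab, then the command-line flags.
print("%s\t%s" % (tf_config, cmd_line_flags))
```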

tensor2tensor/docs/distributed_training.md

Lines changed: 19 additions & 14 deletions
@@ -10,52 +10,54 @@ along with a set of flags.

 ## `TF_CONFIG`

-Both workers and parameter servers must have the `TF_CONFIG` environment
+Both masters and parameter servers must have the `TF_CONFIG` environment
 variable set.

 The `TF_CONFIG` environment variable is a json-encoded string with the addresses
-of the workers and parameter servers (in the `'cluster'` key) and the
+of the masters and parameter servers (in the `'cluster'` key) and the
 identification of the current task (in the `'task'` key).

 For example:

 ```
 cluster = {
     'ps': ['host1:2222', 'host2:2222'],
-    'worker': ['host3:2222', 'host4:2222', 'host5:2222']
+    'master': ['host3:2222', 'host4:2222', 'host5:2222']
 }
 os.environ['TF_CONFIG'] = json.dumps({
     'cluster': cluster,
-    'task': {'type': 'worker', 'index': 1}
+    'task': {'type': 'master', 'index': 1},
+    'environment': 'cloud',
 })
 ```

 ## Command-line flags

-The following T2T command-line flags must also be set on the workers for
+The following T2T command-line flags must also be set on the masters for
 distributed training:

 - `--master=grpc://$ADDRESS`
-- `--worker_replicas=$NUM_WORKERS`
-- `--worker_gpu=$NUM_GPUS_PER_WORKER`
-- `--worker_id=$WORKER_ID`
+- `--worker_replicas=$NUM_MASTERS`
+- `--worker_gpu=$NUM_GPUS_PER_MASTER`
+- `--worker_id=$MASTER_ID`
+- `--worker_job='/job:master'`
 - `--ps_replicas=$NUM_PS`
 - `--ps_gpu=$NUM_GPUS_PER_PS`
 - `--schedule=train`
 - `--sync`, if you want synchronous training, i.e. for there to be a single
-  master worker coordinating the work across "ps" jobs (yes, the naming is
-  unfortunate). If not set, then each worker operates independently while
-  variables are shared on the parameter servers.
+  master coordinating the work across "ps" jobs. If not set, then each master
+  operates independently while variables are shared on the parameter servers.

-Parameter servers only need `--schedule=run_std_server`.
+Parameter servers only need `--master=grpc://$ADDRESS` and
+`--schedule=run_std_server`.

 ## Utility to produce `TF_CONFIG` and flags

 [`t2t-make-tf-configs`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/bin/t2t-make-tf-configs))
 generates the `TF_CONFIG` json strings and the above-mentioned command-line
-flags for the workers and parameter servers.
+flags for the masters and parameter servers.

-Given a set of worker and parameter server addresses, the script outputs, for
+Given a set of master and parameter server addresses, the script outputs, for
 each job, a line with the `TF_CONFIG` environment variable and the command-line
 flags necessary for distributed training. For each job, you should invoke the
 `t2t-trainer` with the `TF_CONFIG` value and flags that are output.
@@ -66,6 +68,9 @@ For example:
 TF_CONFIG=$JOB_TF_CONFIG t2t-trainer $JOB_FLAGS --model=transformer ...
 ```

+Modify the `--worker_gpu` and `--ps_gpu` flags, which specify how many gpus are
+on each master and ps, respectively, as needed for your machine/cluster setup.
+
 ## Command-line flags for eval jobs

 Eval jobs should set the following flags and do not need the `TF_CONFIG`
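
As a point of reference for the "Parameter servers only need `--master=grpc://$ADDRESS` and `--schedule=run_std_server`" note in the doc above, here is a minimal sketch of what a ps job amounts to, assuming TF 1.x `tf.train.ClusterSpec`/`tf.train.Server`. The actual `t2t-trainer` reaches this through `tf.contrib.learn`'s `run_std_server` schedule, so treat this as illustrative only, not the t2t code path.

```
# Minimal sketch (assumption, not t2t code): a ps job under
# --schedule=run_std_server effectively starts a standard TF server
# from the TF_CONFIG cluster description and blocks forever.
import json
import os

import tensorflow as tf

tf_config = json.loads(os.environ["TF_CONFIG"])
cluster = tf.train.ClusterSpec(tf_config["cluster"])
task = tf_config["task"]

# Serve variables to the master jobs until the process is killed.
server = tf.train.Server(cluster,
                         job_name=task["type"],
                         task_index=task["index"])
server.join()
```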

tensor2tensor/utils/trainer_utils.py

Lines changed: 10 additions & 2 deletions
@@ -91,6 +91,8 @@
 flags.DEFINE_integer("worker_gpu", 1, "How many GPUs to use.")
 flags.DEFINE_integer("worker_replicas", 1, "How many workers to use.")
 flags.DEFINE_integer("worker_id", 0, "Which worker task are we.")
+flags.DEFINE_float("worker_gpu_memory_fraction", 1.,
+                   "Fraction of GPU memory to allocate.")
 flags.DEFINE_integer("ps_gpu", 0, "How many GPUs to use per ps.")
 flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining gpus."
                     " e.g. \"1 3 2 4\"")
@@ -177,6 +179,7 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name):
       config=tf.contrib.learn.RunConfig(
           master=FLAGS.master,
           model_dir=output_dir,
+          gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction,
           session_config=session_config(),
           keep_checkpoint_max=FLAGS.keep_checkpoint_max))
   # Store the hparams in the estimator as well
@@ -270,16 +273,21 @@ def session_config():
   """The TensorFlow Session config to use."""
   graph_options = tf.GraphOptions(optimizer_options=tf.OptimizerOptions(
       opt_level=tf.OptimizerOptions.L1, do_function_inlining=False))
+
   if FLAGS.experimental_optimize_placement:
     rewrite_options = tf.RewriterConfig(optimize_tensor_layout=True)
     rewrite_options.optimizers.append("pruning")
     rewrite_options.optimizers.append("constfold")
     rewrite_options.optimizers.append("layout")
     graph_options = tf.GraphOptions(
         rewrite_options=rewrite_options, infer_shapes=True)
-  config = tf.ConfigProto(
-      allow_soft_placement=True, graph_options=graph_options)

+  gpu_options = tf.GPUOptions(
+      per_process_gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction)
+
+  config = tf.ConfigProto(allow_soft_placement=True,
+                          graph_options=graph_options,
+                          gpu_options=gpu_options)
   return config
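
The new `--worker_gpu_memory_fraction` flag (default 1.0) flows into `tf.GPUOptions`, which caps how much GPU memory the process allocates; this matters when several jobs share one GPU, e.g. a master and a ps with `--ps_gpu=1` on the same machine. Below is a standalone illustration of that effect, assuming TF 1.x APIs; it is not t2t code and the 0.45 value is just an example.

```
# Standalone illustration (assumption, not t2t code): cap this process at
# roughly 45% of GPU memory so another process can share the same device.
import tensorflow as tf

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)

with tf.Session(config=config) as sess:
    # Ops run in this session allocate at most the configured fraction.
    print(sess.run(tf.constant("memory-capped session")))
```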