Commit ee82dcd

hakosgeorgepaw authored and committed
Add application compile op
Summary:
This adds the `IPUApplicationCompile` op that compiles a function and returns a string with the path to the compiled executable. It also supports freezing the variables so that they become constants embedded into the executable. This is not really tied to the application runtime per se, but it seems good to limit the initial scope. Ref. T41635. TF2.4 only.

Test Plan: Added new tests and integrated with the existing application runtime test.

Reviewers: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, gauthamg, jakeh, georgep

Reviewed By: #tensorflow, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, jakeh, georgep

Subscribers: georgep

Maniphest Tasks: T41635

Differential Revision: https://phabricator.sourcevertex.net/D48034
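
A rough usage sketch of the intent (not taken from this diff): the Python wrapper in `ops/application_compile_op.py` is added by this commit but its contents are not shown here, so the helper name and signature below are assumptions, not confirmed code.

```python
# Hypothetical usage sketch. The wrapper name and signature are assumptions;
# only the underlying `IPUApplicationCompile` op is defined in this commit.
import tensorflow as tf
from tensorflow.python.ipu.ops import application_compile_op

w = tf.Variable(tf.ones([2, 2]))  # resource variable; can be frozen into the executable


def model(x):
  return tf.matmul(x, w)


x = tf.zeros([2, 2], dtype=tf.float32)

# Compile `model` for the IPU and serialise the Poplar executable to the given
# path; the op's string output is that path.
path = application_compile_op.experimental_application_compile_op(
    model, inputs=[x], output_path="/tmp/model.poplar_exec")
print(path)
```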
1 parent 818520b commit ee82dcd

File tree

12 files changed: +582 −60 lines


tensorflow/compiler/plugin/poplar/BUILD

Lines changed: 7 additions & 2 deletions
@@ -956,7 +956,8 @@ cc_library(
 cc_library(
     name = "kernels",
     srcs = [
-        "kernels/datastream/application_runtime.cc",
+        "kernels/application_runtime/application_compile.cc",
+        "kernels/application_runtime/application_runtime.cc",
         "kernels/datastream/dataset_benchmark.cc",
         "kernels/datastream/feeds.cc",
         "kernels/datastream/host_embedding.cc",
@@ -1016,6 +1017,7 @@ cc_library(
         ":driver",
         ":xla_util",
         "//tensorflow/compiler/jit:xla_device",
+        "//tensorflow/compiler/jit/kernels:xla_ops",
         "//tensorflow/compiler/plugin/poplar/kernels/dataset:kernels",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
@@ -1372,7 +1374,10 @@ tf_custom_op_py_library(
 
 cc_library(
     name = "application_runtime",
-    srcs = ["ops/datastream/application_runtime.cc"],
+    srcs = [
+        "ops/application_runtime/application_compile.cc",
+        "ops/application_runtime/application_runtime.cc",
+    ],
     deps = [
         "//tensorflow/core:framework",
     ],

tensorflow/compiler/plugin/poplar/kernels/application_runtime/application_compile.cc

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/jit/kernels/xla_ops.h"
#include "tensorflow/compiler/plugin/poplar/driver/poplar_executable.h"
#include "tensorflow/compiler/plugin/poplar/driver/poplar_platform.h"
#include "tensorflow/compiler/plugin/poplar/kernels/ipu_kernels_common.h"
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/status.h"

namespace tensorflow {

namespace {

Status BuildCompilationCache(OpKernelContext* ctx, se::Platform* platform,
                             XlaCompilationCache** out_cache) {
  xla::LocalClientOptions client_options;
  client_options.set_platform(platform);
  client_options.set_intra_op_parallelism_threads(
      ctx->device()->tensorflow_cpu_worker_threads()->num_threads);
  TF_ASSIGN_OR_RETURN(
      auto* client, xla::ClientLibrary::GetOrCreateLocalClient(client_options));
  const XlaOpRegistry::DeviceRegistration* registration;
  if (!XlaOpRegistry::GetCompilationDevice("IPU", &registration)) {
    return errors::InvalidArgument("No JIT device registered for IPU");
  }

  *out_cache = new XlaCompilationCache(
      client, DeviceType(registration->compilation_device_name));
  return Status::OK();
}

xla::StatusOr<xla::LocalExecutable*> CompileExecutable(
    OpKernelContext* ctx, const NameAttrList& function, se::Platform* platform,
    absl::Span<const Tensor* const> inputs,
    absl::Span<const VariableInfo> variable_infos,
    absl::Span<const int> constants) {
  auto* resource_manager = ctx->resource_manager();
  if (!resource_manager) {
    return errors::Internal("Resource manager not found");
  }

  XlaCompilationCache* cache;
  TF_RETURN_IF_ERROR(resource_manager->LookupOrCreate<XlaCompilationCache>(
      resource_manager->default_container(), "ipu_application_compile_cache",
      &cache, [&](XlaCompilationCache** cache) {
        return BuildCompilationCache(ctx, platform, cache);
      }));
  core::ScopedUnref cache_ref(cache);

  const auto* function_library = ctx->function_library();
  if (!function_library) {
    return errors::Internal("Function library not found");
  }

  const auto* flib_def = function_library->GetFunctionLibraryDefinition();
  const auto* func_def = CHECK_NOTNULL(flib_def)->Find(function.name());
  if (!func_def) {
    return errors::Internal("Function not found: " + function.name());
  }

  VLOG(1) << "Compiling function: " << DebugString(*func_def);

  XlaCompiler::Options options;
  options.client = cache->client();
  options.device_type = cache->device_type();
  options.flib_def = flib_def;
  options.graph_def_version = function_library->graph_def_version();

  se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                              platform);
  options.device_allocator = &tf_allocator_adapter;

  XlaCompiler::CompileOptions compile_options;
  compile_options.is_entry_computation = true;
  compile_options.always_return_tuple = false;

  // IPU Specific - store the names of all inputs.
  std::vector<std::string> mangled_input_names(inputs.size());
  for (int64 i = 0; i != inputs.size(); ++i) {
    mangled_input_names[i] = ctx->op_kernel().requested_input(i);
  }

  TF_ASSIGN_OR_RETURN(
      std::vector<XlaCompiler::Argument> arguments,
      XlaComputationLaunchContext::BuildXlaCompilerArguments(
          constants, inputs, variable_infos, mangled_input_names));

  const XlaCompiler::CompilationResult* compilation_result;
  xla::LocalExecutable* executable;
  TF_RETURN_IF_ERROR(cache->Compile(options, function, arguments,
                                    compile_options,
                                    XlaCompilationCache::CompileMode::kStrict,
                                    &compilation_result, &executable));
  return executable;
}

}  // namespace

class IPUApplicationCompile : public OpKernel {
 public:
  explicit IPUApplicationCompile(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("function", &function_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("resource_indices", &resource_indices_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("constant_indices", &constant_indices_));
    OP_REQUIRES_OK(
        ctx, ctx->GetAttr("executable_output_path", &executable_output_path_));
  }

  void Compute(OpKernelContext* ctx) {
    auto platform_or_status =
        se::MultiPlatformManager::PlatformWithName("Poplar");
    OP_REQUIRES_OK(ctx, platform_or_status.status());
    auto* platform = platform_or_status.ValueOrDie();

    std::vector<const Tensor*> inputs = InputsFromContext(ctx);
    std::vector<VariableInfo> variable_infos;
    OP_REQUIRES_OK(ctx, GetVariableInfosFromInputs(
                            ctx->resource_manager(), ctx->device(), inputs,
                            resource_indices_, &variable_infos));

    OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos)));

    auto executable_or_status = CompileExecutable(
        ctx, function_, platform, inputs, variable_infos, constant_indices_);
    OP_REQUIRES_OK(ctx, executable_or_status.status());

    auto* poplar_executable =
        dynamic_cast<xla::poplarplugin::PoplarExecutable*>(
            executable_or_status.ValueOrDie()->executable());
    OP_REQUIRES(ctx, poplar_executable != nullptr,
                errors::Internal("Missing Poplar executable"));

    OP_REQUIRES_OK(ctx, poplar_executable->Serialize(executable_output_path_));
    ctx->set_output(0, Tensor(executable_output_path_));
  }

 private:
  NameAttrList function_;
  std::string executable_output_path_;
  std::vector<int> constant_indices_;
  std::vector<int> resource_indices_;

  TF_DISALLOW_COPY_AND_ASSIGN(IPUApplicationCompile);
};

// We register the op both for CPU and IPU to make it easier to use, as we then
// can handle any colocation requirements from variables etc. The function will
// be compiled for IPU regardless of the device placement of the op itself.
REGISTER_KERNEL_BUILDER(Name("IPUApplicationCompile").Device(DEVICE_CPU),
                        IPUApplicationCompile);
REGISTER_KERNEL_BUILDER(Name("IPUApplicationCompile").Device(DEVICE_XLA_IPU),
                        IPUApplicationCompile);

}  // namespace tensorflow

tensorflow/compiler/plugin/poplar/ops/application_runtime/application_compile.cc

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/op.h"

namespace tensorflow {

REGISTER_OP("IPUApplicationCompile")
    .Input("args: Targs")
    .Attr("Targs: list(type) >= 0")
    .Attr("resource_indices: list(int) >= 0")
    .Attr("constant_indices: list(int) >= 0")
    .Attr("executable_output_path: string")
    .Output("output: string")
    .Attr("function: func")
    // Compilation cache is stateful.
    .SetIsStateful();

}  // namespace tensorflow
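
The registered interface above can be inspected from Python through the standard op-def registry. A minimal sketch, assuming a Graphcore TensorFlow 2.4 build with the Poplar plugin linked in (otherwise the lookup returns None):

```python
# Minimal sketch: inspect the registered OpDef for IPUApplicationCompile.
# Assumes a Graphcore TensorFlow 2.4 build where this op is compiled in.
from tensorflow.python.framework import op_def_registry

op_def = op_def_registry.get("IPUApplicationCompile")
assert op_def is not None, "IPUApplicationCompile is not registered in this build"

print(op_def.name)                    # IPUApplicationCompile
print([a.name for a in op_def.attr])  # Targs, resource_indices, constant_indices,
                                      # executable_output_path, function
print(op_def.is_stateful)             # True - the compilation cache is stateful
```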

tensorflow/python/framework/func_graph.py

Lines changed: 4 additions & 1 deletion
@@ -509,7 +509,10 @@ def _capture_by_value(
             compat.as_bytes(op_type), 1, uncaptured_inputs, attr_list,
             context.context())
       else:
-        op = ops.get_default_graph()._create_op_internal(  # pylint: disable=protected-access
+        # Make sure the name is unique in the outer graph.
+        outer_graph = ops.get_default_graph()
+        name = outer_graph.unique_name(name)
+        op = outer_graph._create_op_internal(  # pylint: disable=protected-access
             op_type,
             uncaptured_inputs,
             dtypes,
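
The fix leans on `Graph.unique_name`, which deduplicates requested op names within a graph, so creating the captured-value op in the outer graph can no longer collide with an existing op of the same name. A quick illustration of the behaviour the change relies on:

```python
# Graph.unique_name returns the name unchanged the first time and a
# deduplicated variant ("name_1", "name_2", ...) on later requests.
import tensorflow as tf

g = tf.Graph()
print(g.unique_name("capture"))  # capture
print(g.unique_name("capture"))  # capture_1
```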

tensorflow/python/ipu/BUILD

Lines changed: 15 additions & 0 deletions
@@ -13,6 +13,7 @@ load("@local_config_ipu_horovod//:build_defs_horovod.bzl", "if_horovod", "poprun
 py_library(
     name = "ipu_ops_lib",
     srcs = [
+        "ops/application_compile_op.py",
         "ops/cross_replica_ops.py",
         "ops/embedding_ops.py",
         "ops/functional_ops.py",
@@ -291,11 +292,23 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "application_compile_test",
+    size = "large",
+    srcs = ["tests/application_compile_test.py"],
+    deps = [
+        "//tensorflow/compiler/plugin/poplar:test_utils_py",
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/python/ipu:ipu_lib",
+    ],
+)
+
 tf_py_test(
     name = "application_runtime_test",
     size = "large",
     srcs = ["tests/application_runtime_test.py"],
     shard_count = 4,
+    tags = ["hw_poplar_test"],
     deps = [
         "//tensorflow/compiler/plugin/poplar:ipu_ops_py",
         "//tensorflow/compiler/plugin/poplar:test_utils_py",
@@ -1913,6 +1926,8 @@ tf_py_test(
 test_suite(
     name = "all_tests",
     tests = [
+        "application_compile_test",
+        "application_runtime_test",
         "assume_equal_test",
         "candidate_sampler_test",
         "config_test",

tensorflow/python/ipu/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.python.ipu.ops import all_to_all_op
 from tensorflow.python.ipu.ops import all_to_all_op_grad
+from tensorflow.python.ipu.ops import application_compile_op
 from tensorflow.python.ipu.ops import custom_ops
 from tensorflow.python.ipu.ops import cross_replica_ops
 from tensorflow.python.ipu.ops import cross_replica_ops_grad
