pytorch
diff --git a/‎.circleci/config.yml‎
Lines changed: 211 additions & 33 deletions b/‎.circleci/config.yml‎
Lines changed: 211 additions & 33 deletions
diff --git a/‎WORKSPACE‎
Lines changed: 4 additions & 4 deletions b/‎WORKSPACE‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎core/compiler.cpp‎
Lines changed: 67 additions & 16 deletions b/‎core/compiler.cpp‎
Lines changed: 67 additions & 16 deletions
diff --git a/‎core/conversion/converters/impl/reduce.cpp‎
Lines changed: 34 additions & 1 deletion b/‎core/conversion/converters/impl/reduce.cpp‎
Lines changed: 34 additions & 1 deletion
diff --git a/‎core/conversion/converters/impl/select.cpp‎
Lines changed: 24 additions & 1 deletion b/‎core/conversion/converters/impl/select.cpp‎
Lines changed: 24 additions & 1 deletion
diff --git a/‎core/conversion/converters/impl/unary.cpp‎
Lines changed: 15 additions & 0 deletions b/‎core/conversion/converters/impl/unary.cpp‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎core/conversion/converters/impl/unsqueeze.cpp‎
Lines changed: 1 addition & 1 deletion b/‎core/conversion/converters/impl/unsqueeze.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎core/lowering/lowering.cpp‎
Lines changed: 1 addition & 1 deletion b/‎core/lowering/lowering.cpp‎
Lines changed: 1 addition & 1 deletion
@@ -56,17 +56,17 @@ new_local_repository(
 http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "59b8b5e1954a86d50b79c13f06398d385b200da13e37a08ecf31d3c62e5ca127",
+    sha256 = "8b3b48615169c83c1b643c0efade078ea080b1da598e15fcf01bc59421f3095e",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-cxx11-abi-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-cxx11-abi-shared-with-deps-2.0.0.dev20230219%2Bcu117.zip"],
 )
 
 http_archive(
     name = "libtorch_pre_cxx11_abi",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "e260fc7476be89d1650953e8643e9f7363845f5a52de4bab87ac0e619c1f6ad4",
+    sha256 = "aa7fd06079d260ff83c344d043fb84fbd9cf831cf375ed8b5a1b62416817af31",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
+    urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-shared-with-deps-2.0.0.dev20230219%2Bcu117.zip"],
 )
 
 # Download these tarballs manually from the NVIDIA website
 
@@ -138,7 +138,8 @@ partitioning::GraphAndMapping BuildHybridGraph(
     torch::jit::Block* block,
     CompileSpec cfg,
     ir::StaticParams static_params,
-    ir::CollectionTypeMap first_use_types) {
+    ir::CollectionTypeMap first_use_types,
+    bool expect_full_compilation = false) {
   auto convert_info = cfg.convert_info;
   auto partitioning_info = cfg.partitioning_info;
 
@@ -149,17 +150,20 @@ partitioning::GraphAndMapping BuildHybridGraph(
   // TODO: Combine this within partition call
   partitioning::populateInputIValues(&partitioning_ctx);
 
-  partitioning::partition(&partitioning_ctx);
+  partitioning::partition(&partitioning_ctx, expect_full_compilation);
 
   for (auto& partitioned_block : partitioning_ctx.partitioned_blocks) {
     partitioning::PartitionedGraph& segmented_blocks = partitioned_block.second;
+    int num_torch_segments = 0;
+    int num_trt_segments = 0;
 
     for (auto& seg_block : segmented_blocks) {
       LOG_INFO("Block segment:" << seg_block);
       std::ostringstream trt_engine_id;
       trt_engine_id << reinterpret_cast<const int*>(&seg_block);
 
       if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
+        num_trt_segments++;
         auto inputs = seg_block.construct_inputs_spec();
         // update the input ranges for each segments
         convert_info.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params);
@@ -180,8 +184,32 @@ partitioning::GraphAndMapping BuildHybridGraph(
             true);
 
         seg_block.update_graph(temp_g);
+      } else {
+        num_torch_segments++;
+
+        // If full compilation is expected, ensure that all operators in Torch blocks are
+        // for collections processing
+        if (expect_full_compilation) {
+          for (auto torch_node : seg_block.block()->nodes()) {
+            if (partitioning::CollectionNodeKinds.find(torch_node->kind()) == partitioning::CollectionNodeKinds.end()) {
+              TORCHTRT_THROW_ERROR(
+                  "Full compilation specified but node "
+                  << *torch_node
+                  << " is set to run in PyTorch due to either lack of support in TensorRT or graph partitioning rules."
+                  << " Try recompiling with require_full_compilation=False.");
+            }
+          }
+        }
       }
     }
+
+    // If full compilation is expected, cannot have more than 2 Torch segments
+    // (one for preprocessing inputs, one for post-processing outputs) and 1 TRT segment
+    if (expect_full_compilation && !(num_torch_segments <= 2 && num_trt_segments == 1)) {
+      TORCHTRT_THROW_ERROR(
+          "Full compilation was requested but unable to convert all operations to TensorRT."
+          << " Try recompiling with require_full_compilation=False.");
+    }
   }
 
   return partitioning::stitch(&partitioning_ctx, block);
@@ -191,7 +219,8 @@ ir::TypeMap MapInputsAndDetermineDTypes(
     CompileSpec& cfg,
     std::shared_ptr<torch::jit::Graph>& g,
     ir::StaticParams& static_params,
-    ir::CollectionTypeMap& first_use_type_map) {
+    ir::CollectionTypeMap& first_use_type_map,
+    bool requires_collection_handling = false) {
   cfg.convert_info.collection_input_spec_map =
       std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params));
   cfg.partitioning_info.collection_input_spec_map =
@@ -226,7 +255,7 @@ ir::TypeMap MapInputsAndDetermineDTypes(
             "Cannot infer input type from calcuations in graph for input "
             << in->debugName() << ". Assuming it is Float32. If not, specify input type explicity");
         spec[i].dtype = at::kFloat;
-      } else if (spec[i].dtype_is_user_defined && cfg.partitioning_info.enabled) {
+      } else if (spec[i].dtype_is_user_defined && (cfg.partitioning_info.enabled || requires_collection_handling)) {
         if (!est_type_opt[i]) {
           LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting");
           std::stringstream ss;
@@ -297,6 +326,11 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
   return engine;
 }
 
+// Returns true when either cfg.lower_info.forced_fallback_modules or
+// cfg.partitioning_info.forced_fallback_operators contains at least one entry.
+bool userRequestedFallback(CompileSpec& cfg) {
+  const bool has_forced_modules = !cfg.lower_info.forced_fallback_modules.empty();
+  const bool has_forced_operators = !cfg.partitioning_info.forced_fallback_operators.empty();
+  return has_forced_modules || has_forced_operators;
+}
+
 torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
   torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");
 
@@ -315,8 +349,17 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
       // Infer the type of an input from the weights of the calculation
       auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block());
 
+      // Determine if the block is convertible/has collection output, and based on the result,
+      // whether full compilation can be expected
+      auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
+      auto outputIsCollection = conversion::OutputIsCollection(g->block());
+      auto requires_collection_handling = (isBlockConvertible && outputIsCollection);
+
+      // Determine whether user specifications necessitate partitioning
+      auto isFallbackRequested = userRequestedFallback(cfg);
+
       // Extract map of IValue to DType
-      auto type_map = MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
+      auto type_map = MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types, requires_collection_handling);
 
       // Check whether any of the input types are Long
       bool user_requested_long = false;
@@ -330,20 +373,28 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
         user_requested_long &= (casts_inserted > 0);
       }
 
-      auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
-      auto outputIsCollection = conversion::OutputIsCollection(g->block());
-      if (cfg.partitioning_info.enabled && !user_requested_long &&
-          (cfg.lower_info.forced_fallback_modules.size() == 0 &&
-           cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) &&
-          !outputIsCollection) {
+      // Partitioning is required if:
+      // 1. User requested some modules/operators fallback
+      // 2. The block (graph) cannot be converted due to operator coverage
+      // 3. The output of the graph is a collection
+      // 4. The user requested a non-TRT data type input
+      auto isPartitioningRequired =
+          (isFallbackRequested || !isBlockConvertible || outputIsCollection || user_requested_long);
+
+      // The user did not require full compilation, but the model can be fully compiled
+      if (cfg.partitioning_info.enabled && !isPartitioningRequired) {
         LOG_INFO("Skipping partitioning since model is fully supported");
       }
 
-      if (cfg.partitioning_info.enabled &&
-          (!(cfg.lower_info.forced_fallback_modules.size() == 0 &&
-             cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) ||
-           outputIsCollection || user_requested_long)) {
-        auto graph_and_mapping = BuildHybridGraph(new_mod, g->block(), cfg, static_params, first_use_types);
+      // The user did not require full compilation, and the model requires partitioning
+      // or, the user required full compilation but the I/O of the graph uses collections
+      if ((cfg.partitioning_info.enabled && isPartitioningRequired) || requires_collection_handling) {
+        // If the model is fully-compilable and the user has specified full compilation, run partitioning
+        // to generate collection-processing code in Torch
+        auto expect_full_compilation = (requires_collection_handling && !cfg.partitioning_info.enabled);
+
+        auto graph_and_mapping =
+            BuildHybridGraph(new_mod, g->block(), cfg, static_params, first_use_types, expect_full_compilation);
         new_g = graph_and_mapping.first;
         // renaming the input name of graph after fallback to ensure pytorch deserialize it correctly
         for (size_t i = 0; i < new_g->inputs().size(); ++i) {
 
@@ -203,7 +203,8 @@ auto reduce_registrations TORCHTRT_UNUSED =
                return true;
              }})
         .pattern(
-            {"aten::min(Tensor self) -> Tensor", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+            {"aten::min(Tensor self) -> Tensor",
+             [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
                auto in_tensor = args[0].ITensorOrFreeze(ctx);
                auto in_dims = util::toVec(in_tensor->getDimensions());
 
@@ -216,6 +217,38 @@ auto reduce_registrations TORCHTRT_UNUSED =
                min_layer->setName(util::node_info(n).c_str());
                auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], min_layer->getOutput(0));
 
+               LOG_DEBUG("Output shape: " << out_tensor->getDimensions());
+               return true;
+             }})
+        .pattern(
+            {"aten::any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor",
+             [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+               // Converter for aten::any.dim. Lowered as:
+               //   input -> bool -> int32 -> reduce-sum over `dim` -> bool
+               // Returns true on success; TORCHTRT_CHECK reports failures.
+               auto in_tensor = args[0].ITensorOrFreeze(ctx);
+               auto in_dims = in_tensor->getDimensions();
+               auto dim = args[1].unwrapToInt();
+               LOG_DEBUG("Dim to reduce (original): " << dim);
+               // A negative dim counts back from the last dimension
+               dim = dim < 0 ? (in_dims.nbDims + dim) : dim;
+               LOG_DEBUG("Dim to reduce (converted): " << dim);
+
+               // Reject out-of-range dims before building the mask: shifting by a negative or
+               // too-large amount is undefined behaviour and would yield a meaningless axis mask
+               TORCHTRT_CHECK(
+                   dim >= 0 && dim < in_dims.nbDims,
+                   "Dimension to reduce is out of range for the input of node: " << *n);
+
+               uint32_t axis_mask = 1u << dim;
+               LOG_DEBUG("Axis Mask: " << std::bitset<32>(axis_mask));
+
+               auto keepdim = args[2].unwrapToBool();
+               LOG_DEBUG("Keep dims: " << keepdim);
+
+               // Summing raw values only implements "any" when every element is 0 or 1: non-zero
+               // values of opposite sign (e.g. 1 and -1) would cancel to 0 and wrongly report false.
+               // Normalize non-bool inputs to bool first so each non-zero element counts once.
+               if (in_tensor->getType() != nvinfer1::DataType::kBOOL) {
+                 in_tensor = castITensor(
+                     ctx, in_tensor, nvinfer1::DataType::kBOOL, (util::node_info(n) + "_in_bool").c_str());
+               }
+               // Reduce does not work on bool inputs
+               in_tensor =
+                   castITensor(ctx, in_tensor, nvinfer1::DataType::kINT32, (util::node_info(n) + "_in").c_str());
+               auto sum_layer = ctx->net->addReduce(*in_tensor, nvinfer1::ReduceOperation::kSUM, axis_mask, keepdim);
+
+               TORCHTRT_CHECK(sum_layer, "Unable to create sum layer from node: " << *n);
+
+               sum_layer->setName(util::node_info(n).c_str());
+               // The sum is the number of non-zero elements along `dim`; casting back to bool gives "any"
+               auto out_tensor = castITensor(
+                   ctx, sum_layer->getOutput(0), nvinfer1::DataType::kBOOL, (util::node_info(n) + "_out").c_str());
+               out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], out_tensor);
                LOG_DEBUG("Output shape: " << out_tensor->getDimensions());
                return true;
              }});
 
@@ -180,6 +180,29 @@ auto select_registrations TORCHTRT_UNUSED =
                return true;
              }})
         .pattern(
+            {"aten::index_select(Tensor self, int dim, Tensor index) -> Tensor",
+             [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+               // Converter for aten::index_select: emits a single TensorRT gather layer that
+               // picks the entries named by `index` along `dim` of the input tensor.
+               auto self_tensor = args[0].ITensorOrFreeze(ctx);
+               const auto rank = static_cast<int64_t>(self_tensor->getDimensions().nbDims);
+               auto axis = args[1].unwrapToInt();
+               // Handle a negative axis by referring to nbDims of the input tensor
+               if (axis < 0) {
+                 axis = axis + rank;
+               }
+               auto index_tensor = args[2].ITensorOrFreeze(ctx);
+
+               LOG_DEBUG("Gather input dimensions: " << self_tensor->getDimensions());
+               LOG_DEBUG("Dimension to select: " << axis);
+               LOG_DEBUG("Index dimensions: " << index_tensor->getDimensions());
+
+               auto gather_layer = ctx->net->addGather(*self_tensor, *index_tensor, axis);
+               TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n);
+               auto gathered = gather_layer->getOutput(0);
+               LOG_DEBUG("Gather tensor shape: " << gathered->getDimensions());
+
+               // Bind the gather result to the node's output value
+               gathered = ctx->AssociateValueAndTensor(n->outputs()[0], gathered);
+               LOG_DEBUG("Output tensor shape: " << gathered->getDimensions());
+               return true;
+             }})
+        .pattern(
             {"aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)",
              [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
                auto in = args[0].ITensor();
@@ -337,7 +360,7 @@ auto select_registrations TORCHTRT_UNUSED =
 
                  // IGatherLayer takes in input tensor, the indices, and the axis of input tensor to take indices
                  // from
-                 auto gather_layer = ctx->net->addGather(*in, *indicesTensor, 0);
+                 auto gather_layer = ctx->net->addGather(*in, *indicesTensor, adv_idx_indices[0]);
                  TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n);
                  auto gather_out = gather_layer->getOutput(0);
 
 
@@ -34,6 +34,21 @@ auto reciprocal_registration TORCHTRT_UNUSED = RegisterNodeConversionPatterns().
        return true;
      }});
 
+// Converter for aten::logical_not: casts the input to bool when it is not already
+// bool, then applies a TensorRT unary kNOT layer and binds its output to the node.
+auto logical_not_registration TORCHTRT_UNUSED = RegisterNodeConversionPatterns().pattern(
+    {"aten::logical_not(Tensor self) -> Tensor", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+       auto operand = args[0].ITensorOrFreeze(ctx);
+       const bool already_bool = (operand->getType() == nvinfer1::DataType::kBOOL);
+       if (!already_bool) {
+         // unary not layer only supports bool inputs, so convert before negating
+         operand = castITensor(ctx, operand, nvinfer1::DataType::kBOOL, util::node_info(n).c_str());
+       }
+
+       auto unary_layer = ctx->net->addUnary(*operand, nvinfer1::UnaryOperation::kNOT);
+       TORCHTRT_CHECK(unary_layer, "Unable to create logical_not layer from node: " << *n);
+       unary_layer->setName(util::node_info(n).c_str());
+
+       auto negated = ctx->AssociateValueAndTensor(n->outputs()[0], unary_layer->getOutput(0));
+       LOG_DEBUG("Output tensor shape: " << negated->getDimensions());
+       return true;
+     }});
+
 #define convert(unary, trt_type)                                                               \
   auto unary##_registrations TORCHTRT_UNUSED = RegisterNodeConversionPatterns().pattern(       \
       {"aten::" #unary "(Tensor self) -> Tensor",                                              \
 
@@ -32,7 +32,7 @@ auto unsqueeze_registrations TORCHTRT_UNUSED = RegisterNodeConversionPatterns().
 
        auto shuffle_layer = ctx->net->addShuffle(*self);
        TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n);
-       shuffle_layer->setReshapeDimensions(util::unsqueezeDims(self->getDimensions(), dim));
+       shuffle_layer->setReshapeDimensions(util::unsqueezeDims(self->getDimensions(), dim, 1, false));
 
        auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_layer->getOutput(0));
 
 
@@ -32,7 +32,7 @@ int AutocastLongInputs(
     std::string target_device_name) {
   int num_autocasts = 0;
   // For each graph input, determine if it can be autocasted
-  for (int i = 0; i < g->inputs().size(); i++) {
+  for (size_t i = 0; i < g->inputs().size(); i++) {
     auto input = g->inputs()[i];
 
     // Autocasted inputs must be Tensor-type