diff --git a/INSTALL.md b/INSTALL.md index 6d70d5d..0646dcc 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,14 +1,15 @@ ## Installation ### Requirements: -- PyTorch 1.3 (1.4 may cause some errors.) +- python >= 3.8 +- PyTorch == 2.5.1 - torchvision from master - cocoapi - yacs - matplotlib - GCC >= 4.9 - OpenCV -- CUDA >= 9.2 +- CUDA >= 12.4 ### Option 1: Step-by-step installation @@ -18,18 +19,17 @@ # for that, check that `which conda`, `which pip` and `which python` points to the # right path. From a clean conda env, this is what you need to do -conda create --name MEGA -y python=3.7 +conda create --name MEGA -y python=3.12 source activate MEGA # this installs the right pip and dependencies for the fresh python conda install ipython pip # mega and coco api dependencies -pip install ninja yacs cython matplotlib tqdm opencv-python scipy +pip install build wheel installer ninja yacs cython matplotlib tqdm opencv-python scipy numpy # follow PyTorch installation in https://pytorch.org/get-started/locally/ -# we give the instructions for CUDA 10.0 -conda install pytorch=1.3.0 torchvision cudatoolkit=10.0 -c pytorch +pip install torch torchvision torchaudio export INSTALL_DIR=$PWD @@ -37,35 +37,32 @@ export INSTALL_DIR=$PWD cd $INSTALL_DIR git clone https://github.com/cocodataset/cocoapi.git cd cocoapi/PythonAPI -python setup.py build_ext install +python -m build --wheel --no-isolation +python -m installer dist/*.whl # install cityscapesScripts cd $INSTALL_DIR git clone https://github.com/mcordts/cityscapesScripts.git cd cityscapesScripts/ -python setup.py build_ext install +python -m build --wheel --no-isolation +python -m installer dist/*.whl # install apex cd $INSTALL_DIR git clone https://github.com/NVIDIA/apex.git cd apex -python setup.py install --cuda_ext --cpp_ext +python -m build --wheel --no-isolation +python -m installer dist/*.whl # install PyTorch Detection cd $INSTALL_DIR git clone https://github.com/Scalsol/mega.pytorch.git cd mega.pytorch - -# the 
following will install the lib with -# symbolic links, so that you can modify -# the files if you want and won't need to -# re-build it -python setup.py build develop - -pip install 'pillow<7.0.0' +python -m build --wheel --no-isolation +python -m installer dist/*.whl unset INSTALL_DIR # or if you are on macOS # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop -``` \ No newline at end of file +``` diff --git a/demo/predictor.py b/demo/predictor.py index 7ba2964..f233ded 100644 --- a/demo/predictor.py +++ b/demo/predictor.py @@ -608,7 +608,7 @@ def overlay_class_names(self, image, predictions): x, y = box[:2] s = template.format(label, score) cv2.putText( - image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2 + image, s, (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2 ) return image diff --git a/mega_core/csrc/cuda/ROIAlign_cuda.cu b/mega_core/csrc/cuda/ROIAlign_cuda.cu index 1142fb3..61a73ea 100644 --- a/mega_core/csrc/cuda/ROIAlign_cuda.cu +++ b/mega_core/csrc/cuda/ROIAlign_cuda.cu @@ -1,10 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
#include #include - -#include -#include -#include +#include +#include +#include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ @@ -272,11 +271,11 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, auto output_size = num_rois * pooled_height * pooled_width * channels; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); + dim3 grid(std::min(at::ceil_div((long)output_size, 512L), 4096L)); dim3 block(512); if (output.numel() == 0) { - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return output; } @@ -294,7 +293,7 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, rois.contiguous().data(), output.data()); }); - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return output; } @@ -317,12 +316,12 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); + dim3 grid(std::min(at::ceil_div((long)grad.numel(), 512L), 4096L)); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } @@ -341,6 +340,6 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, grad_input.data(), rois.contiguous().data()); }); - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } diff --git a/mega_core/csrc/cuda/ROIPool_cuda.cu b/mega_core/csrc/cuda/ROIPool_cuda.cu index 8f072ff..ed1d93b 100644 --- a/mega_core/csrc/cuda/ROIPool_cuda.cu +++ b/mega_core/csrc/cuda/ROIPool_cuda.cu @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
#include #include - -#include -#include -#include - +#include +#include +#include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ @@ -126,11 +124,11 @@ std::tuple ROIPool_forward_cuda(const at::Tensor& input, cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); + dim3 grid(std::min(at::ceil_div((long)output_size, 512L), 4096L)); dim3 block(512); if (output.numel() == 0) { - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return std::make_tuple(output, argmax); } @@ -148,7 +146,7 @@ std::tuple ROIPool_forward_cuda(const at::Tensor& input, output.data(), argmax.data()); }); - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return std::make_tuple(output, argmax); } @@ -173,12 +171,12 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); + dim3 grid(std::min(at::ceil_div((long)grad.numel(), 512L), 4096L)); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } @@ -197,6 +195,6 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, grad_input.data(), rois.contiguous().data()); }); - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } diff --git a/mega_core/csrc/cuda/SigmoidFocalLoss_cuda.cu b/mega_core/csrc/cuda/SigmoidFocalLoss_cuda.cu index 456a5f2..49e5f0b 100644 --- a/mega_core/csrc/cuda/SigmoidFocalLoss_cuda.cu +++ b/mega_core/csrc/cuda/SigmoidFocalLoss_cuda.cu @@ -4,11 +4,9 @@ // cyfu@cs.unc.edu #include #include - -#include -#include -#include - +#include +#include +#include #include // TODO make it in a common file @@ -117,12 +115,12 @@ at::Tensor SigmoidFocalLoss_forward_cuda( auto losses_size = num_samples * logits.size(1); cudaStream_t stream 
= at::cuda::getCurrentCUDAStream(); - dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L)); + dim3 grid(std::min(at::ceil_div((long)losses_size, 512L), 4096L)); dim3 block(512); if (losses.numel() == 0) { - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return losses; } @@ -137,7 +135,7 @@ at::Tensor SigmoidFocalLoss_forward_cuda( num_samples, losses.data()); }); - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return losses; } @@ -162,11 +160,11 @@ at::Tensor SigmoidFocalLoss_backward_cuda( auto d_logits_size = num_samples * logits.size(1); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L)); + dim3 grid(std::min(at::ceil_div((long)d_logits_size, 512L), 4096L)); dim3 block(512); if (d_logits.numel() == 0) { - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return d_logits; } @@ -183,7 +181,7 @@ at::Tensor SigmoidFocalLoss_backward_cuda( d_logits.data()); }); - THCudaCheck(cudaGetLastError()); + AT_CUDA_CHECK(cudaGetLastError()); return d_logits; } diff --git a/mega_core/csrc/cuda/deform_conv_cuda.cu b/mega_core/csrc/cuda/deform_conv_cuda.cu index 74f7d33..b99f5c6 100644 --- a/mega_core/csrc/cuda/deform_conv_cuda.cu +++ b/mega_core/csrc/cuda/deform_conv_cuda.cu @@ -4,8 +4,8 @@ #include #include -#include -#include +#include +#include #include #include @@ -69,26 +69,26 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, int padW, int dilationH, int dilationW, int group, int deformable_group) { - AT_CHECK(weight.ndimension() == 4, + TORCH_CHECK(weight.ndimension() == 4, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s", weight.ndimension()); - AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); - AT_CHECK(kW > 0 && kH > 0, + TORCH_CHECK(kW > 0 && kH > 0, "kernel size 
should be greater than zero, but got kH: %d kW: %d", kH, kW); - AT_CHECK((weight.size(2) == kH && weight.size(3) == kW), + TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW), "kernel size should be consistent with weight, ", "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, kW, weight.size(2), weight.size(3)); - AT_CHECK(dW > 0 && dH > 0, + TORCH_CHECK(dW > 0 && dH > 0, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); - AT_CHECK( + TORCH_CHECK( dilationW > 0 && dilationH > 0, "dilation should be greater than 0, but got dilationH: %d dilationW: %d", dilationH, dilationW); @@ -104,7 +104,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, dimw++; } - AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", + TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", ndim); long nInputPlane = weight.size(1) * group; @@ -116,7 +116,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; - AT_CHECK(nInputPlane % deformable_group == 0, + TORCH_CHECK(nInputPlane % deformable_group == 0, "input channels must divide deformable group size"); if (outputWidth < 1 || outputHeight < 1) @@ -126,27 +126,27 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, outputWidth); - AT_CHECK(input.size(1) == nInputPlane, + TORCH_CHECK(input.size(1) == nInputPlane, "invalid number of input planes, expected: %d, but got: %d", nInputPlane, input.size(1)); - AT_CHECK((inputHeight >= kH && inputWidth >= kW), + TORCH_CHECK((inputHeight >= kH && inputWidth >= kW), "input image is smaller than kernel"); - AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), + TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), "invalid spatial 
size of offset, expected height: %d width: %d, but " "got height: %d width: %d", outputHeight, outputWidth, offset.size(2), offset.size(3)); - AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), "invalid number of channels of offset"); if (gradOutput != NULL) { - AT_CHECK(gradOutput->size(dimf) == nOutputPlane, + TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane, "invalid number of gradOutput planes, expected: %d, but got: %d", nOutputPlane, gradOutput->size(dimf)); - AT_CHECK((gradOutput->size(dimh) == outputHeight && + TORCH_CHECK((gradOutput->size(dimh) == outputHeight && gradOutput->size(dimw) == outputWidth), "invalid size of gradOutput, expected height: %d width: %d , but " "got height: %d width: %d", @@ -197,7 +197,7 @@ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); @@ -304,7 +304,7 @@ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, @@ -420,7 +420,7 @@ int deform_conv_backward_parameters_cuda( long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + TORCH_CHECK((offset.size(0) == batchSize), "invalid 
batch size of offset"); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, @@ -501,8 +501,8 @@ void modulated_deform_conv_cuda_forward( const int dilation_w, const int group, const int deformable_group, const bool with_bias) { - AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); - AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); @@ -583,8 +583,8 @@ void modulated_deform_conv_cuda_backward( int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias) { - AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); - AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); diff --git a/mega_core/csrc/cuda/deform_conv_kernel_cuda.cu b/mega_core/csrc/cuda/deform_conv_kernel_cuda.cu index b4f8813..ec34349 100644 --- a/mega_core/csrc/cuda/deform_conv_kernel_cuda.cu +++ b/mega_core/csrc/cuda/deform_conv_kernel_cuda.cu @@ -62,10 +62,10 @@ #include -#include #include #include #include +#include using namespace at; diff --git a/mega_core/csrc/cuda/deform_pool_cuda.cu b/mega_core/csrc/cuda/deform_pool_cuda.cu index 71f305a..e99c529 100644 --- a/mega_core/csrc/cuda/deform_pool_cuda.cu +++ b/mega_core/csrc/cuda/deform_pool_cuda.cu @@ -8,8 +8,8 @@ #include #include -#include -#include +#include +#include #include #include @@ -39,7 +39,7 @@ void deform_psroi_pooling_cuda_forward( const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float 
trans_std) { - AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); @@ -65,8 +65,8 @@ void deform_psroi_pooling_cuda_backward( const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { - AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); - AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); diff --git a/mega_core/csrc/cuda/deform_pool_kernel_cuda.cu b/mega_core/csrc/cuda/deform_pool_kernel_cuda.cu index 127899e..6453a9f 100644 --- a/mega_core/csrc/cuda/deform_pool_kernel_cuda.cu +++ b/mega_core/csrc/cuda/deform_pool_kernel_cuda.cu @@ -10,7 +10,7 @@ #include -#include +#include #include #include #include @@ -362,4 +362,4 @@ void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, { printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); } -} \ No newline at end of file +} diff --git a/mega_core/csrc/cuda/nms.cu b/mega_core/csrc/cuda/nms.cu index 833d852..fdd7d38 100644 --- a/mega_core/csrc/cuda/nms.cu +++ b/mega_core/csrc/cuda/nms.cu @@ -1,10 +1,8 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
#include #include - -#include -#include - +#include +#include #include #include @@ -61,7 +59,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, t |= 1ULL << i; } } - const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); + const int col_blocks = at::ceil_div(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } @@ -76,20 +74,18 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { int boxes_num = boxes.size(0); - const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); + const int col_blocks = at::ceil_div(boxes_num, threadsPerBlock); scalar_t* boxes_dev = boxes_sorted.data(); - THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState - unsigned long long* mask_dev = NULL; - //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, + //AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::raw_alloc((void**) &mask_dev, // boxes_num * col_blocks * sizeof(unsigned long long))); - mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); + mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long)); - dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), - THCCeilDiv(boxes_num, threadsPerBlock)); + dim3 blocks(at::ceil_div(boxes_num, threadsPerBlock), + at::ceil_div(boxes_num, threadsPerBlock)); dim3 threads(threadsPerBlock); nms_kernel<<>>(boxes_num, nms_overlap_thresh, @@ -97,7 +93,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { mask_dev); std::vector mask_host(boxes_num * col_blocks); - THCudaCheck(cudaMemcpy(&mask_host[0], + AT_CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, sizeof(unsigned long long) * boxes_num * col_blocks, cudaMemcpyDeviceToHost)); @@ -122,7 +118,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { } } - THCudaFree(state, mask_dev); + 
c10::cuda::CUDACachingAllocator::raw_delete(mask_dev); // TODO improve this part return std::get<0>(order_t.index({ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( diff --git a/mega_core/data/datasets/evaluation/cityscapes/eval_instances.py b/mega_core/data/datasets/evaluation/cityscapes/eval_instances.py index a486926..c504ac9 100644 --- a/mega_core/data/datasets/evaluation/cityscapes/eval_instances.py +++ b/mega_core/data/datasets/evaluation/cityscapes/eval_instances.py @@ -449,7 +449,7 @@ def evaluateBoxMatches(matches, args): # Here we hold the results # First dimension is class, second overlap - ap = np.zeros((len(minRegionSizes), len(args.instLabels), len(overlaps)), np.float) + ap = np.zeros((len(minRegionSizes), len(args.instLabels), len(overlaps)), np.float32) for dI, minRegionSize in enumerate(minRegionSizes): for (oI, overlapTh) in enumerate(overlaps): @@ -650,7 +650,7 @@ def evaluateMaskMatches(matches, args): # Here we hold the results # First dimension is class, second overlap - ap = np.zeros((len(minRegionSizes), len(args.instLabels), len(overlaps)), np.float) + ap = np.zeros((len(minRegionSizes), len(args.instLabels), len(overlaps)), np.float32) for dI, minRegionSize in enumerate(minRegionSizes): for (oI, overlapTh) in enumerate(overlaps): diff --git a/mega_core/modeling/rpn/anchor_generator.py b/mega_core/modeling/rpn/anchor_generator.py index ec275ef..eda5d1d 100644 --- a/mega_core/modeling/rpn/anchor_generator.py +++ b/mega_core/modeling/rpn/anchor_generator.py @@ -226,8 +226,8 @@ def generate_anchors( """ return _generate_anchors( stride, - np.array(sizes, dtype=np.float) / stride, - np.array(aspect_ratios, dtype=np.float), + np.array(sizes, dtype=np.float32) / stride, + np.array(aspect_ratios, dtype=np.float32), ) @@ -235,7 +235,7 @@ def _generate_anchors(base_size, scales, aspect_ratios): """Generate anchor (reference) windows by enumerating aspect ratios X scales wrt a reference (0, 0, base_size - 1, base_size - 1) window. 
""" - anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1 + anchor = np.array([1, 1, base_size, base_size], dtype=np.float32) - 1 anchors = _ratio_enum(anchor, aspect_ratios) anchors = np.vstack( [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])] diff --git a/mega_core/structures/bounding_box.py b/mega_core/structures/bounding_box.py index 25791d5..db4b980 100644 --- a/mega_core/structures/bounding_box.py +++ b/mega_core/structures/bounding_box.py @@ -213,6 +213,7 @@ def __len__(self): def clip_to_image(self, remove_empty=True): TO_REMOVE = 1 + self.bbox = self.bbox.clone() self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE) self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE) self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE) diff --git a/mega_core/structures/segmentation_mask.py b/mega_core/structures/segmentation_mask.py index ab2c17e..598de40 100644 --- a/mega_core/structures/segmentation_mask.py +++ b/mega_core/structures/segmentation_mask.py @@ -178,11 +178,11 @@ def _findContours(self): reshaped_contour = [] for entity in contour: - assert len(entity.shape) == 3 + assert len(entity.get().shape) == 3 assert ( - entity.shape[1] == 1 + entity.get().shape[1] == 1 ), "Hierarchical contours are not allowed" - reshaped_contour.append(entity.reshape(-1).tolist()) + reshaped_contour.append(entity.get().reshape(-1).tolist()) contours.append(reshaped_contour) return contours diff --git a/mega_core/utils/c2_model_loading.py b/mega_core/utils/c2_model_loading.py index de4094e..297d11f 100644 --- a/mega_core/utils/c2_model_loading.py +++ b/mega_core/utils/c2_model_loading.py @@ -132,14 +132,8 @@ def _rename_weights_for_resnet(weights, stage_names): def _load_c2_pickled_weights(file_path): with open(file_path, "rb") as f: - if torch._six.PY3: - data = pickle.load(f, encoding="latin1") - else: - data = pickle.load(f) - if "blobs" in data: - weights = data["blobs"] - else: - weights = data + data = pickle.load(f, 
encoding="latin1") + weights = data["blobs"] if "blobs" in data else data return weights diff --git a/mega_core/utils/distributed.py b/mega_core/utils/distributed.py index 4988f33..3c2f1d1 100644 --- a/mega_core/utils/distributed.py +++ b/mega_core/utils/distributed.py @@ -66,8 +66,7 @@ def get_gpus_nocache(): """ cmds = 'nvidia-smi --query-gpu=name --format=csv,noheader'.split(' ') with run_and_terminate_process( - cmds, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - bufsize=1) as process: + cmds, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process: return [ str(line).strip() for line in iter(process.stdout.readline, b'') ] diff --git a/mega_core/utils/imports.py b/mega_core/utils/imports.py index 53e27e2..59c1952 100644 --- a/mega_core/utils/imports.py +++ b/mega_core/utils/imports.py @@ -1,23 +1,13 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -import torch - -if torch._six.PY3: - import importlib - import importlib.util - import sys - - - # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa - def import_file(module_name, file_path, make_importable=False): - spec = importlib.util.spec_from_file_location(module_name, file_path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - if make_importable: - sys.modules[module_name] = module - return module -else: - import imp - - def import_file(module_name, file_path, make_importable=None): - module = imp.load_source(module_name, file_path) - return module +import importlib +import importlib.util +import sys + + +def import_file(module_name, file_path, make_importable=False): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + if make_importable: + sys.modules[module_name] = module + return module diff --git 
a/mega_core/utils/model_zoo.py b/mega_core/utils/model_zoo.py index 374b095..e160869 100644 --- a/mega_core/utils/model_zoo.py +++ b/mega_core/utils/model_zoo.py @@ -2,14 +2,9 @@ import os import sys -try: - from torch.hub import _download_url_to_file - from torch.hub import urlparse - from torch.hub import HASH_REGEX -except ImportError: - from torch.utils.model_zoo import _download_url_to_file - from torch.utils.model_zoo import urlparse - from torch.utils.model_zoo import HASH_REGEX +from torch.hub import download_url_to_file +from torch.hub import urlparse +from torch.hub import HASH_REGEX from mega_core.utils.comm import is_main_process from mega_core.utils.comm import synchronize @@ -56,6 +51,6 @@ def cache_url(url, model_dir=None, progress=True): # if the hash_prefix is less than 6 characters if len(hash_prefix) < 6: hash_prefix = None - _download_url_to_file(url, cached_file, hash_prefix, progress=progress) + download_url_to_file(url, cached_file, hash_prefix, progress=progress) synchronize() return cached_file diff --git a/tests/test_backbones.py b/tests/test_backbones.py index a584c0a..c9847d6 100644 --- a/tests/test_backbones.py +++ b/tests/test_backbones.py @@ -12,11 +12,11 @@ # overwrite configs if specified, otherwise default config is used BACKBONE_CFGS = { - "R-50-FPN": "e2e_faster_rcnn_R_50_FPN_1x.yaml", - "R-101-FPN": "e2e_faster_rcnn_R_101_FPN_1x.yaml", - "R-152-FPN": "e2e_faster_rcnn_R_101_FPN_1x.yaml", - "R-50-FPN-RETINANET": "retinanet/retinanet_R-50-FPN_1x.yaml", - "R-101-FPN-RETINANET": "retinanet/retinanet_R-101-FPN_1x.yaml", + "R-50-FPN": "official_configs/e2e_faster_rcnn_R_50_FPN_1x.yaml", + "R-101-FPN": "official_configs/e2e_faster_rcnn_R_101_FPN_1x.yaml", + "R-152-FPN": "official_configs/e2e_faster_rcnn_R_101_FPN_1x.yaml", + "R-50-FPN-RETINANET": "official_configs/retinanet/retinanet_R-50-FPN_1x.yaml", + "R-101-FPN-RETINANET": "official_configs/retinanet/retinanet_R-101-FPN_1x.yaml", } diff --git a/tests/test_detectors.py 
b/tests/test_detectors.py index f6d5a33..9e8ed8e 100644 --- a/tests/test_detectors.py +++ b/tests/test_detectors.py @@ -12,30 +12,30 @@ CONFIG_FILES = [ # bbox - "e2e_faster_rcnn_R_50_C4_1x.yaml", - "e2e_faster_rcnn_R_50_FPN_1x.yaml", - "e2e_faster_rcnn_fbnet.yaml", + "official_configs/e2e_faster_rcnn_R_50_C4_1x.yaml", + "official_configs/e2e_faster_rcnn_R_50_FPN_1x.yaml", + "official_configs/e2e_faster_rcnn_fbnet.yaml", # mask - "e2e_mask_rcnn_R_50_C4_1x.yaml", - "e2e_mask_rcnn_R_50_FPN_1x.yaml", - "e2e_mask_rcnn_fbnet.yaml", + "official_configs/e2e_mask_rcnn_R_50_C4_1x.yaml", + "official_configs/e2e_mask_rcnn_R_50_FPN_1x.yaml", + "official_configs/e2e_mask_rcnn_fbnet.yaml", # keypoints # TODO: fail to run for random model due to empty head input # "e2e_keypoint_rcnn_R_50_FPN_1x.yaml", # gn - "gn_baselines/e2e_faster_rcnn_R_50_FPN_1x_gn.yaml", + "official_configs/gn_baselines/e2e_faster_rcnn_R_50_FPN_1x_gn.yaml", # TODO: fail to run for random model due to empty head input # "gn_baselines/e2e_mask_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml", - + # retinanet - "retinanet/retinanet_R-50-FPN_1x.yaml", + "official_configs/retinanet/retinanet_R-50-FPN_1x.yaml", # rpn only - "rpn_R_50_C4_1x.yaml", - "rpn_R_50_FPN_1x.yaml", + "official_configs/rpn_R_50_C4_1x.yaml", + "official_configs/rpn_R_50_FPN_1x.yaml", ] EXCLUDED_FOLDERS = [ diff --git a/tests/test_feature_extractors.py b/tests/test_feature_extractors.py index e9149ed..30089ad 100644 --- a/tests/test_feature_extractors.py +++ b/tests/test_feature_extractors.py @@ -4,8 +4,8 @@ import copy import torch # import modules to to register feature extractors -from mega_core.modeling.backbone import build_backbone # NoQA -from mega_core.modeling.roi_heads.roi_heads import build_roi_heads # NoQA +from mega_core.modeling.backbone import build_backbone # NoQA +from mega_core.modeling.roi_heads.roi_heads import build_roi_heads # NoQA from mega_core.modeling import registry from mega_core.structures.bounding_box import BoxList from 
mega_core.config import cfg as g_cfg @@ -19,6 +19,7 @@ FEATURE_EXTRACTORS_INPUT_CHANNELS = { # in_channels was not used, load through config "ResNet50Conv5ROIFeatureExtractor": 1024, + "ResNetConv52MLPFeatureExtractor": 1024, } @@ -32,7 +33,14 @@ def _test_feature_extractors( in_channels_default = 64 for name, builder in extractors.items(): - print('Testing {}...'.format(name)) + if name == "RDNFeatureExtractor" or name == "MEGAFeatureExtractor": + # Currently feature extraction is called as follows (see line 69): + # out = fe([input], [box_list] * N) + # But the RDNFeatureExtractor and MEGAFeatureExtractor extractors + # require inputs in a different form. + # TODO: test them separately. + continue + if name in overwrite_cfgs: cfg = load_config(overwrite_cfgs[name]) else: @@ -42,15 +50,20 @@ def _test_feature_extractors( in_channels = overwrite_in_channels.get( name, in_channels_default) + print('Testing {} with {} channels...'.format(name, in_channels)) + fe = builder(cfg, in_channels) self.assertIsNotNone( getattr(fe, 'out_channels', None), - 'Need to provide out_channels for feature extractor {}'.format(name) + 'Need to provide out_channels for feature extractor {}'.format( + name) ) N, C_in, H, W = 2, in_channels, 24, 32 input = torch.rand([N, C_in, H, W], dtype=torch.float32) bboxes = [[1, 1, 10, 10], [5, 5, 8, 8], [2, 2, 3, 4]] + # Note: img_size = [384, 512] requires 1024 in_channels, therefore we + # override the defaults in FEATURE_EXTRACTORS_INPUT_CHANNELS img_size = [384, 512] box_list = BoxList(bboxes, img_size, "xyxy") out = fe([input], [box_list] * N)